diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-03 14:10:23 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-07-03 14:10:23 +0000 |
| commit | 145449b1e420787bb99721a429341fa6be3adfb6 (patch) | |
| tree | 1d56ae694a6de602e348dd80165cf881a36600ed /llvm/lib/Target/AArch64 | |
| parent | ecbca9f5fb7d7613d2b94982c4825eb0d33d6842 (diff) | |
Diffstat (limited to 'llvm/lib/Target/AArch64')
81 files changed, 16165 insertions, 11071 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 4d1464901777..a6065d4ed9ec 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -16,6 +16,8 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" #include "llvm/Support/DataTypes.h" #include "llvm/Target/TargetMachine.h" @@ -71,6 +73,7 @@ void initializeAArch64A53Fix835769Pass(PassRegistry&); void initializeAArch64A57FPLoadBalancingPass(PassRegistry&); void initializeAArch64AdvSIMDScalarPass(PassRegistry&); void initializeAArch64BranchTargetsPass(PassRegistry&); +void initializeAArch64CFIFixupPass(PassRegistry&); void initializeAArch64CollectLOHPass(PassRegistry&); void initializeAArch64CondBrTuningPass(PassRegistry &); void initializeAArch64CompressJumpTablesPass(PassRegistry&); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 9a04b28a8b8f..f092c039b58e 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -64,6 +64,10 @@ def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true", "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">; +def FeatureLDAPR : SubtargetFeature<"ldapr", "HasLDAPR", "true", + "Use LDAPR to lower atomic loads; experimental until we " + "have more testing/a formal correctness proof">; + def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", "Enable out of line atomics to support LSE instructions">; @@ -154,6 +158,10 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", "Has zero-cycle zeroing instructions for generic registers">; +// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0". 
+// as movi is more efficient across all cores. Newer cores can eliminate +// fmovs early and there is no difference with movi, but this not true for +// all implementations. def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false", "Has no zero-cycle zeroing instructions for FP registers">; @@ -168,7 +176,7 @@ def FeatureZCZeroingFPWorkaround : SubtargetFeature<"zcz-fp-workaround", "The zero-cycle floating-point zeroing instruction has a bug">; def FeatureStrictAlign : SubtargetFeature<"strict-align", - "StrictAlign", "true", + "RequiresStrictAlign", "true", "Disallow all unaligned memory " "access">; @@ -190,11 +198,11 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature< "Prefer likely predicted branches over selects">; def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", - "CustomAsCheapAsMove", "true", + "HasCustomCheapAsMoveHandling", "true", "Use custom handling of cheap instructions">; def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move", - "ExynosAsCheapAsMove", "true", + "HasExynosCheapAsMoveHandling", "true", "Use Exynos specific handling of cheap instructions", [FeatureCustomCheapAsMoveHandling]>; @@ -202,12 +210,16 @@ def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", - "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">; + "IsMisaligned128StoreSlow", "true", "Misaligned 128 bit stores are slow">; def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", - "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">; + "IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">; + +def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address", + "IsStoreAddressAscend", "false", + "Schedule vector stores by ascending address">; 
-def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow", +def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow", "true", "STR of Q register with register offset is slow">; def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature< @@ -246,6 +258,10 @@ def FeatureFuseCryptoEOR : SubtargetFeature< "fuse-crypto-eor", "HasFuseCryptoEOR", "true", "CPU fuses AES/PMULL and EOR operations">; +def FeatureFuseAdrpAdd : SubtargetFeature< + "fuse-adrp-add", "HasFuseAdrpAdd", "true", + "CPU fuses adrp+add operations">; + def FeatureFuseLiterals : SubtargetFeature< "fuse-literals", "HasFuseLiterals", "true", "CPU fuses literal generation operations">; @@ -438,13 +454,8 @@ def FeatureEnhancedCounterVirtualization : def FeatureRME : SubtargetFeature<"rme", "HasRME", "true", "Enable Realm Management Extension">; -// A subset of SVE(2) instructions are legal in Streaming SVE execution mode -// defined by SME. -def FeatureStreamingSVE : SubtargetFeature<"streaming-sve", - "HasStreamingSVE", "true", - "Enable subset of SVE(2) instructions for Streaming SVE execution mode">; def FeatureSME : SubtargetFeature<"sme", "HasSME", "true", - "Enable Scalable Matrix Extension (SME)", [FeatureStreamingSVE, FeatureBF16]>; + "Enable Scalable Matrix Extension (SME)", [FeatureBF16, FeatureUseScalarIncVL]>; def FeatureSMEF64 : SubtargetFeature<"sme-f64", "HasSMEF64", "true", "Enable Scalable Matrix Extension (SME) F64F64 instructions", [FeatureSME]>; @@ -464,6 +475,11 @@ def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true", def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769", "FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">; +def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice", + "NoBTIAtReturnTwice", "true", + "Don't place a BTI instruction " + "after a return-twice">; + //===----------------------------------------------------------------------===// // Architectures. 
// @@ -534,7 +550,18 @@ def HasV8_0rOps : SubtargetFeature< FeaturePAuth, FeatureRCPC, //v8.4 FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI, - FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO]>; + FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO, + // Not mandatory in v8.0-R, but included here on the grounds that it + // only enables names of system registers + FeatureSpecRestrict + ]>; + +// Only intended to be used by disassemblers. +def FeatureAll + : SubtargetFeature<"all", "IsAll", "true", "Enable all instructions", []>; + +class AssemblerPredicateWithAll<dag cond, string name=""> + : AssemblerPredicate<(any_of FeatureAll, cond), name>; //===----------------------------------------------------------------------===// // Register File Description @@ -552,6 +579,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64SchedPredAmpere.td" include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -596,7 +624,7 @@ class AArch64Unsupported { list<Predicate> F; } def SVEUnsupported : AArch64Unsupported { let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, - HasSVE2BitPerm, HasSVEorStreamingSVE, HasSVE2orStreamingSVE]; + HasSVE2BitPerm, HasSVEorSME, HasSVE2orSME]; } def PAUnsupported : AArch64Unsupported { @@ -621,6 +649,7 @@ include "AArch64SchedThunderX2T99.td" include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" +include "AArch64SchedAmpere1.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -649,6 +678,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", FeatureFuseAES, FeatureBalanceFPOps, FeatureCustomCheapAsMoveHandling, + FeatureFuseAdrpAdd, FeatureFuseLiterals, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive]>; @@ -657,11 +687,13 @@ def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", 
"Cortex-A65 ARM processors", [ FeatureFuseAES, FeatureFuseAddress, + FeatureFuseAdrpAdd, FeatureFuseLiterals]>; def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureFuseLiterals]>; def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", @@ -802,6 +834,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseArithmeticLogic, FeatureFuseCCSelect, FeatureFuseCryptoEOR, + FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureZCRegMove, FeatureZCZeroing]>; @@ -813,13 +846,15 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", FeatureFuseAddress, FeatureFuseAES, FeatureFuseCCSelect, + FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureLSLFast, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive]>; -def TuneExynosM4 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", - "Samsung Exynos-M3 processors", +// Re-uses some scheduling and tunings from the ExynosM3 proc family. 
+def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", + "Samsung Exynos-M4 processors", [FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, FeatureExynosCheapAsMoveHandling, @@ -828,6 +863,7 @@ def TuneExynosM4 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", FeatureFuseAES, FeatureFuseArithmeticLogic, FeatureFuseCCSelect, + FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureLSLFast, FeaturePostRAScheduler, @@ -934,6 +970,16 @@ def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", FeatureFuseAES, FeaturePostRAScheduler]>; +def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureLSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals]>; def ProcessorFeatures { list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -947,13 +993,14 @@ def ProcessorFeatures { FeatureFP16FML]; list<SubtargetFeature> A65 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureRCPC, FeatureSSBS, FeatureRAS]; + FeatureRCPC, FeatureSSBS, FeatureRAS, + FeaturePerfMon]; list<SubtargetFeature> A76 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureRCPC, FeatureSSBS]; + FeatureRCPC, FeatureSSBS, FeaturePerfMon]; list<SubtargetFeature> A77 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureRCPC]; + FeatureRCPC, FeaturePerfMon, FeatureSSBS]; list<SubtargetFeature> A78 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, FeatureRCPC, FeaturePerfMon, FeatureSPE, @@ -968,14 +1015,15 @@ def ProcessorFeatures { FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8]; list<SubtargetFeature> R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16, FeatureFP16FML, 
FeatureSSBS, FeaturePredRes, - FeatureSB, FeatureSpecRestrict]; + FeatureSB]; list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, - FeatureSPE, FeatureFullFP16, FeatureDotProd]; + FeatureSPE, FeatureFullFP16, FeatureDotProd, + FeatureSSBS]; list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureDotProd, - FeaturePAuth]; + FeaturePAuth, FeatureSSBS]; list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, FeatureMTE, FeatureETE, FeatureSVE2BitPerm, @@ -1012,13 +1060,15 @@ def ProcessorFeatures { FeatureRDM]; list<SubtargetFeature> NeoverseE1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureNEON, - FeatureRCPC, FeatureSSBS]; + FeatureRCPC, FeatureSSBS, FeaturePerfMon]; list<SubtargetFeature> NeoverseN1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, FeatureNEON, - FeatureRCPC, FeatureSPE, FeatureSSBS]; + FeatureRCPC, FeatureSPE, FeatureSSBS, + FeaturePerfMon]; list<SubtargetFeature> NeoverseN2 = [HasV8_5aOps, FeatureBF16, FeatureETE, FeatureMatMulInt8, FeatureMTE, FeatureSVE2, - FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto]; + FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto, + FeaturePerfMon]; list<SubtargetFeature> Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist, FeatureCrypto, FeatureFPARMv8, FeatureFP16FML, FeatureFullFP16, FeatureMatMulInt8, FeatureNEON, @@ -1041,17 +1091,20 @@ def ProcessorFeatures { list<SubtargetFeature> TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureFP16FML, FeatureDotProd]; + list<SubtargetFeature> Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS]; // ETE and TRBE are future architecture extensions. 
We temporarily enable them // by default for users targeting generic AArch64. The extensions do not // affect code generated by the compiler and can be used only by explicitly // mentioning the new system register names in assembly. - list<SubtargetFeature> Generic = [FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureETE]; + list<SubtargetFeature> Generic = [FeatureFPARMv8, FeatureNEON, FeatureETE]; } - +// FeatureFuseAdrpAdd is enabled under Generic to allow linker merging +// optimizations. def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic, - [FeatureFuseAES, FeaturePostRAScheduler]>; + [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler]>; def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53, [TuneA35]>; def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53, @@ -1178,6 +1231,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, [TuneCarmel]>; +// Ampere Computing +def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, + [TuneAmpere1]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index 4cdf5f144437..37a65b64a885 100644 --- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -223,6 +223,7 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { if (isFirstInstructionInSequence(PrevInstr) && isSecondInstructionInSequence(CurrInstr)) { LLVM_DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n"); + (void) Idx; Sequences.push_back(CurrInstr); } } diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 
b54a0eaba7d1..ef4860979dd3 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -132,7 +132,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { AArch64FI = MF.getInfo<AArch64FunctionInfo>(); - STI = static_cast<const AArch64Subtarget*>(&MF.getSubtarget()); + STI = &MF.getSubtarget<AArch64Subtarget>(); SetupMachineFunction(MF); @@ -143,10 +143,10 @@ public: int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; - OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer->EmitCOFFSymbolStorageClass(Scl); - OutStreamer->EmitCOFFSymbolType(Type); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->beginCOFFSymbolDef(CurrentFnSym); + OutStreamer->emitCOFFSymbolStorageClass(Scl); + OutStreamer->emitCOFFSymbolType(Type); + OutStreamer->endCOFFSymbolDef(); } // Emit the rest of the function body. @@ -204,10 +204,10 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { // Emit an absolute @feat.00 symbol. This appears to be some kind of // compiler features bitfield read by link.exe. 
MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00")); - OutStreamer->BeginCOFFSymbolDef(S); - OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); - OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); - OutStreamer->EndCOFFSymbolDef(); + OutStreamer->beginCOFFSymbolDef(S); + OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer->endCOFFSymbolDef(); int64_t Feat00Flags = 0; if (M.getModuleFlag("cfguard")) { @@ -251,7 +251,7 @@ void AArch64AsmPrinter::emitFunctionHeaderComment() { const AArch64FunctionInfo *FI = MF->getInfo<AArch64FunctionInfo>(); Optional<std::string> OutlinerString = FI->getOutliningStyle(); if (OutlinerString != None) - OutStreamer->GetCommentOS() << ' ' << OutlinerString; + OutStreamer->getCommentOS() << ' ' << OutlinerString; } void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) @@ -378,10 +378,10 @@ void AArch64AsmPrinter::emitHwasanMemaccessSymbols(Module &M) { bool CompileKernel = (AccessInfo >> HWASanAccessInfo::CompileKernelShift) & 1; - OutStreamer->SwitchSection(OutContext.getELFSection( + OutStreamer->switchSection(OutContext.getELFSection( ".text.hot", ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, - Sym->getName(), /*IsComdat=*/true)); + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName(), + /*IsComdat=*/true)); OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak); @@ -827,7 +827,7 @@ void AArch64AsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM); - OutStreamer->SwitchSection(ReadOnlySec); + OutStreamer->switchSection(ReadOnlySec); auto AFI = MF->getInfo<AArch64FunctionInfo>(); for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { @@ -865,7 
+865,7 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall || MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall || - STI->getRegisterInfo()->hasSVEArgsOrReturn(MF)) { + MF->getInfo<AArch64FunctionInfo>()->isSVECC()) { auto *TS = static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer()); TS->emitDirectiveVariantPCS(CurrentFnSym); @@ -1129,7 +1129,8 @@ void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) { void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); - if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) { + if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() && + STI->hasNEON()) { // Convert H/S register to corresponding D register if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) DestReg = AArch64::D0 + (DestReg - AArch64::H0); @@ -1262,7 +1263,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { break; case AArch64::DBG_VALUE: - case AArch64::DBG_VALUE_LIST: { + case AArch64::DBG_VALUE_LIST: if (isVerbose() && OutStreamer->hasRawTextSupport()) { SmallString<128> TmpStr; raw_svector_ostream OS(TmpStr); @@ -1282,8 +1283,18 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { OutStreamer->emitCFIBKeyFrame(); return; - } - } + } + + case AArch64::EMITMTETAGGED: { + ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType(); + if (ExceptionHandlingType != ExceptionHandling::DwarfCFI && + ExceptionHandlingType != ExceptionHandling::ARM) + return; + + if (getFunctionCFISectionType(*MF) != CFISection::None) + OutStreamer->emitCFIMTETaggedFrame(); + return; + } // Tail calls use pseudo instructions so they have the proper code-gen // attributes (isCall, isReturn, etc.). 
We lower them to the real diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index f26151536a58..c0da242a26de 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -82,9 +82,9 @@ def CC_AArch64_AAPCS : CallingConv<[ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCPassIndirect<i64>>, - CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1], CCAssignToReg<[P0, P1, P2, P3]>>, - CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1], CCPassIndirect<i64>>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -149,7 +149,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, - CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1], CCAssignToReg<[P0, P1, P2, P3]>> ]>; diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index ac243347b24d..d12689970dc5 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -528,10 +528,8 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { // count as MultiUser or block optimization. This is especially important on // arm64_32, where any memory operation is likely to be an explicit use of // xN and an implicit use of wN (the base address register). - if (!UsesSeen.count(Idx)) { + if (UsesSeen.insert(Idx).second) handleUse(MI, MO, LOHInfos[Idx]); - UsesSeen.insert(Idx); - } } } @@ -559,7 +557,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { // Walk the basic block backwards and update the per register state machine // in the process. 
for (const MachineInstr &MI : - instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { + instructionsWithoutDebug(MBB.instr_rbegin(), MBB.instr_rend())) { unsigned Opcode = MI.getOpcode(); switch (Opcode) { case AArch64::ADDXri: diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1994e0eb7fb9..18c111255e53 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -217,7 +217,7 @@ def AArch64PostLegalizerLoweringHelper // Post-legalization combines which are primarily optimizations. def AArch64PostLegalizerCombinerHelper : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", - [copy_prop, erase_undef_store, combines_for_extload, + [copy_prop, combines_for_extload, sext_trunc_sextload, mutate_anyext_to_zext, hoist_logic_op_with_same_opcode_hands, redundant_and, xor_of_and_with_same_reg, @@ -228,6 +228,6 @@ def AArch64PostLegalizerCombinerHelper select_combines, fold_merge_to_zext, constant_fold, identity_combines, ptr_add_immed_chain, overlapping_and, - split_store_zero_128]> { + split_store_zero_128, undef_combines]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 82e8df3b73f9..343f888b7552 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -247,8 +247,8 @@ void SSACCmpConv::updateTailPHIs() { for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { // PHI operands are (Reg, MBB) at (oi-2, oi-1). 
if (I.getOperand(oi - 1).getMBB() == CmpBB) { - I.RemoveOperand(oi - 1); - I.RemoveOperand(oi - 2); + I.removeOperand(oi - 1); + I.removeOperand(oi - 2); } } } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index b0f739cc26e6..910f8cdede75 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -86,6 +86,7 @@ private: unsigned N); bool expandCALL_RVMARKER(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandCALL_BTI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; @@ -759,6 +760,37 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER( return true; } +bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + // Expand CALL_BTI pseudo to: + // - a branch to the call target + // - a BTI instruction + // Mark the sequence as a bundle, to avoid passes moving other code in + // between. + + MachineInstr &MI = *MBBI; + MachineOperand &CallTarget = MI.getOperand(0); + assert((CallTarget.isGlobal() || CallTarget.isReg()) && + "invalid operand for regular call"); + unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR; + MachineInstr *Call = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); + Call->addOperand(CallTarget); + + MachineInstr *BTI = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::HINT)) + // BTI J so that setjmp can to BR to this. 
+ .addImm(36) + .getInstr(); + + if (MI.shouldUpdateCallSiteInfo()) + MBB.getParent()->moveCallSiteInfo(&MI, Call); + + MI.eraseFromParent(); + finalizeBundle(MBB, Call->getIterator(), std::next(BTI->getIterator())); + return true; +} + bool AArch64ExpandPseudo::expandStoreSwiftAsyncContext( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { Register CtxReg = MBBI->getOperand(0).getReg(); @@ -1238,6 +1270,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); case AArch64::BLR_RVMARKER: return expandCALL_RVMARKER(MBB, MBBI); + case AArch64::BLR_BTI: + return expandCALL_BTI(MBB, MBBI); case AArch64::StoreSwiftAsyncContext: return expandStoreSwiftAsyncContext(MBB, MBBI); } diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 793663ef97d7..6de374125466 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -813,7 +813,7 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { } bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { - auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + auto &ST = Fn.getSubtarget<AArch64Subtarget>(); if (ST.getProcFamily() != AArch64Subtarget::Falkor) return false; diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index c67fa62c7a92..49fffa01a974 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -14,6 +14,7 @@ #include "AArch64.h" #include "AArch64CallingConvention.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -282,8 +283,7 @@ public: explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) : FastISel(FuncInfo, LibInfo, 
/*SkipTargetIndependentISel=*/true) { - Subtarget = - &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget()); + Subtarget = &FuncInfo.MF->getSubtarget<AArch64Subtarget>(); Context = &FuncInfo.Fn->getContext(); } @@ -3127,6 +3127,13 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (!Callee && !Symbol) return false; + // Allow SelectionDAG isel to handle calls to functions like setjmp that need + // a bti instruction following the call. + if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) && + !Subtarget->noBTIAtReturnTwice() && + MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) + return false; + // Allow SelectionDAG isel to handle tail calls. if (IsTailCall) return false; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a4d20735e2b1..78babdf9f1f0 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -117,6 +117,72 @@ // // FIXME: also explain the redzone concept. // +// An example of the prologue: +// +// .globl __foo +// .align 2 +// __foo: +// Ltmp0: +// .cfi_startproc +// .cfi_personality 155, ___gxx_personality_v0 +// Leh_func_begin: +// .cfi_lsda 16, Lexception33 +// +// stp xa,bx, [sp, -#offset]! +// ... +// stp x28, x27, [sp, #offset-32] +// stp fp, lr, [sp, #offset-16] +// add fp, sp, #offset - 16 +// sub sp, sp, #1360 +// +// The Stack: +// +-------------------------------------------+ +// 10000 | ........ | ........ | ........ | ........ | +// 10004 | ........ | ........ | ........ | ........ | +// +-------------------------------------------+ +// 10008 | ........ | ........ | ........ | ........ | +// 1000c | ........ | ........ | ........ | ........ 
| +// +===========================================+ +// 10010 | X28 Register | +// 10014 | X28 Register | +// +-------------------------------------------+ +// 10018 | X27 Register | +// 1001c | X27 Register | +// +===========================================+ +// 10020 | Frame Pointer | +// 10024 | Frame Pointer | +// +-------------------------------------------+ +// 10028 | Link Register | +// 1002c | Link Register | +// +===========================================+ +// 10030 | ........ | ........ | ........ | ........ | +// 10034 | ........ | ........ | ........ | ........ | +// +-------------------------------------------+ +// 10038 | ........ | ........ | ........ | ........ | +// 1003c | ........ | ........ | ........ | ........ | +// +-------------------------------------------+ +// +// [sp] = 10030 :: >>initial value<< +// sp = 10020 :: stp fp, lr, [sp, #-16]! +// fp = sp == 10020 :: mov fp, sp +// [sp] == 10020 :: stp x28, x27, [sp, #-16]! +// sp == 10010 :: >>final value<< +// +// The frame pointer (w29) points to address 10020. 
If we use an offset of +// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 +// for w27, and -32 for w28: +// +// Ltmp1: +// .cfi_def_cfa w29, 16 +// Ltmp2: +// .cfi_offset w30, -8 +// Ltmp3: +// .cfi_offset w29, -16 +// Ltmp4: +// .cfi_offset w27, -24 +// Ltmp5: +// .cfi_offset w28, -32 +// //===----------------------------------------------------------------------===// #include "AArch64FrameLowering.h" @@ -126,6 +192,7 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -154,7 +221,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -187,7 +253,7 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", cl::init(true), cl::Hidden); cl::opt<bool> EnableHomogeneousPrologEpilog( - "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden, + "homogeneous-prolog-epilog", cl::Hidden, cl::desc("Emit homogeneous prologue and epilogue for the size " "optimization (default = off)")); @@ -233,6 +299,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(MachineFunction &MF); static bool needsWinCFI(const MachineFunction &MF); static StackOffset getSVEStackSize(const MachineFunction &MF); +static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF); /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If possible, a frame helper call is injected. 
@@ -440,137 +507,309 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(I); } -// Convenience function to create a DWARF expression for -// Expr + NumBytes + NumVGScaledBytes * AArch64::VG -static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, - int NumBytes, int NumVGScaledBytes, unsigned VG, - llvm::raw_string_ostream &Comment) { - uint8_t buffer[16]; +void AArch64FrameLowering::emitCalleeSavedGPRLocations( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); - if (NumBytes) { - Expr.push_back(dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); - Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); - } + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (CSI.empty()) + return; - if (NumVGScaledBytes) { - Expr.push_back((uint8_t)dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); - Expr.push_back((uint8_t)dwarf::DW_OP_bregx); - Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); - Expr.push_back(0); + for (const auto &Info : CSI) { + if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) + continue; - Expr.push_back((uint8_t)dwarf::DW_OP_mul); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); + assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); + unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true); - Comment << (NumVGScaledBytes < 0 ? 
" - " : " + ") - << std::abs(NumVGScaledBytes) << " * VG"; + int64_t Offset = + MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } } -// Creates an MCCFIInstruction: -// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } -MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP( - const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const { - int64_t NumBytes, NumVGScaledBytes; - AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes, - NumVGScaledBytes); +void AArch64FrameLowering::emitCalleeSavedSVELocations( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); + + for (const auto &Info : CSI) { + if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + continue; + + // Not all unwinders may know about SVE registers, so assume the lowest + // common demoninator. 
+ assert(!Info.isSpilledToReg() && "Spilling to registers not implemented"); + unsigned Reg = Info.getReg(); + if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) + continue; + + StackOffset Offset = + StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - + StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI)); - std::string CommentBuffer = "sp"; - llvm::raw_string_ostream Comment(CommentBuffer); + unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } +} - // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG) - SmallString<64> Expr; - Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31)); - Expr.push_back(0); - appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, - TRI.getDwarfRegNum(AArch64::VG, true), Comment); +void AArch64FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedGPRLocations(MBB, MBBI); + emitCalleeSavedSVELocations(MBB, MBBI); +} - // Wrap this into DW_CFA_def_cfa. 
- SmallString<64> DefCfaExpr; - DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); - uint8_t buffer[16]; - DefCfaExpr.append(buffer, - buffer + encodeULEB128(Expr.size(), buffer)); - DefCfaExpr.append(Expr.str()); - return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), - Comment.str()); +static void insertCFISameValue(const MCInstrDesc &Desc, MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, + unsigned DwarfReg) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg)); + BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex); } -MCCFIInstruction AArch64FrameLowering::createCfaOffset( - const TargetRegisterInfo &TRI, unsigned Reg, - const StackOffset &OffsetFromDefCFA) const { - int64_t NumBytes, NumVGScaledBytes; - AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( - OffsetFromDefCFA, NumBytes, NumVGScaledBytes); +void AArch64FrameLowering::resetCFIToInitialState( + MachineBasicBlock &MBB) const { - unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + MachineFunction &MF = *MBB.getParent(); + const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + const auto &TRI = + static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo()); + const auto &MFI = *MF.getInfo<AArch64FunctionInfo>(); - // Non-scalable offsets can use DW_CFA_offset directly. - if (!NumVGScaledBytes) - return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); + const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION); + DebugLoc DL; - std::string CommentBuffer; - llvm::raw_string_ostream Comment(CommentBuffer); - Comment << printReg(Reg, &TRI) << " @ cfa"; + // Reset the CFA to `SP + 0`. 
+ MachineBasicBlock::iterator InsertPt = MBB.begin(); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( + nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0)); + BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex); - // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) - SmallString<64> OffsetExpr; - appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, - TRI.getDwarfRegNum(AArch64::VG, true), Comment); + // Flip the RA sign state. + if (MFI.shouldSignReturnAddress()) { + CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex); + } - // Wrap this into DW_CFA_expression - SmallString<64> CfaExpr; - CfaExpr.push_back(dwarf::DW_CFA_expression); - uint8_t buffer[16]; - CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); - CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); - CfaExpr.append(OffsetExpr.str()); + // Shadow call stack uses X18, reset it. + if (needsShadowCallStackPrologueEpilogue(MF)) + insertCFISameValue(CFIDesc, MF, MBB, InsertPt, + TRI.getDwarfRegNum(AArch64::X18, true)); - return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str()); + // Emit .cfi_same_value for callee-saved registers. 
+ const std::vector<CalleeSavedInfo> &CSI = + MF.getFrameInfo().getCalleeSavedInfo(); + for (const auto &Info : CSI) { + unsigned Reg = Info.getReg(); + if (!TRI.regNeedsCFI(Reg, Reg)) + continue; + insertCFISameValue(CFIDesc, MF, MBB, InsertPt, + TRI.getDwarfRegNum(Reg, true)); + } } -void AArch64FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { +static void emitCalleeSavedRestores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool SVE) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetInstrInfo *TII = STI.getInstrInfo(); - DebugLoc DL = MBB.findDebugLoc(MBBI); - // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) return; + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + for (const auto &Info : CSI) { - Register Reg = Info.getReg(); + if (SVE != + (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)) + continue; - // Not all unwinders may know about SVE registers, so assume the lowest - // common demoninator. 
- unsigned NewReg; - if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg)) - Reg = NewReg; - else + unsigned Reg = Info.getReg(); + if (SVE && + !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg)) continue; - StackOffset Offset; - if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) { - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - Offset = - StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); - } else { - Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea()); - } - unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore( + nullptr, TRI.getDwarfRegNum(Info.getReg(), true))); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(MachineInstr::FrameDestroy); + } +} + +void AArch64FrameLowering::emitCalleeSavedGPRRestores( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedRestores(MBB, MBBI, false); +} + +void AArch64FrameLowering::emitCalleeSavedSVERestores( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + emitCalleeSavedRestores(MBB, MBBI, true); +} + +static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { + switch (Reg.id()) { + default: + // The called routine is expected to preserve r19-r28 + // r29 and r30 are used as frame pointer and link register resp. 
+ return 0; + + // GPRs +#define CASE(n) \ + case AArch64::W##n: \ + case AArch64::X##n: \ + return AArch64::X##n + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + CASE(17); + CASE(18); +#undef CASE + + // FPRs +#define CASE(n) \ + case AArch64::B##n: \ + case AArch64::H##n: \ + case AArch64::S##n: \ + case AArch64::D##n: \ + case AArch64::Q##n: \ + return HasSVE ? AArch64::Z##n : AArch64::Q##n + CASE(0); + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + CASE(17); + CASE(18); + CASE(19); + CASE(20); + CASE(21); + CASE(22); + CASE(23); + CASE(24); + CASE(25); + CASE(26); + CASE(27); + CASE(28); + CASE(29); + CASE(30); + CASE(31); +#undef CASE + } +} + +void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const { + // Insertion point. + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + + // Fake a debug loc. + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + + const MachineFunction &MF = *MBB.getParent(); + const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *STI.getRegisterInfo(); + + BitVector GPRsToZero(TRI.getNumRegs()); + BitVector FPRsToZero(TRI.getNumRegs()); + bool HasSVE = STI.hasSVE(); + for (MCRegister Reg : RegsToZero.set_bits()) { + if (TRI.isGeneralPurposeRegister(MF, Reg)) { + // For GPRs, we only care to clear out the 64-bit register. 
+ if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) + GPRsToZero.set(XReg); + } else if (AArch64::FPR128RegClass.contains(Reg) || + AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR32RegClass.contains(Reg) || + AArch64::FPR16RegClass.contains(Reg) || + AArch64::FPR8RegClass.contains(Reg)) { + // For FPRs, + if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE)) + FPRsToZero.set(XReg); + } + } + + const AArch64InstrInfo &TII = *STI.getInstrInfo(); + + // Zero out GPRs. + for (MCRegister Reg : GPRsToZero.set_bits()) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), Reg).addImm(0); + + // Zero out FP/vector registers. + for (MCRegister Reg : FPRsToZero.set_bits()) + if (HasSVE) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::DUP_ZI_D), Reg) + .addImm(0) + .addImm(0); + else + BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVIv2d_ns), Reg).addImm(0); + + if (HasSVE) { + for (MCRegister PReg : + {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4, + AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9, + AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14, + AArch64::P15}) { + if (RegsToZero[PReg]) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg); + } } } @@ -881,16 +1120,9 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc, - bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) { - // Ignore instructions that do not operate on SP, i.e. shadow call stack - // instructions and associated CFI instruction. 
- while (MBBI->getOpcode() == AArch64::STRXpost || - MBBI->getOpcode() == AArch64::LDRXpre || - MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) { - if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION) - assert(MBBI->getOperand(0).getReg() != AArch64::SP); - ++MBBI; - } + bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, + MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup, + int CFAOffset = 0) { unsigned NewOpc; switch (MBBI->getOpcode()) { default: @@ -949,12 +1181,14 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // If the first store isn't right where we want SP then we can't fold the // update in so create a normal arithmetic instruction instead. + MachineFunction &MF = *MBB.getParent(); if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 || CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(CSStackSizeInc), TII, - InProlog ? MachineInstr::FrameSetup - : MachineInstr::FrameDestroy); + StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag, + false, false, nullptr, EmitCFI, + StackOffset::getFixed(CFAOffset)); + return std::prev(MBBI); } @@ -981,8 +1215,15 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // Generate a new SEH code that corresponds to the new instruction. if (NeedsWinCFI) { *HasWinCFI = true; - InsertSEH(*MIB, *TII, - InProlog ? 
MachineInstr::FrameSetup : MachineInstr::FrameDestroy); + InsertSEH(*MIB, *TII, FrameFlag); + } + + if (EmitCFI) { + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(FrameFlag); } return std::prev(MBB.erase(MBBI)); @@ -998,16 +1239,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, return; unsigned Opc = MI.getOpcode(); - - // Ignore instructions that do not operate on SP, i.e. shadow call stack - // instructions and associated CFI instruction. - if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre || - Opc == AArch64::CFI_INSTRUCTION) { - if (Opc != AArch64::CFI_INSTRUCTION) - assert(MI.getOperand(0).getReg() != AArch64::SP); - return; - } - unsigned Scale; switch (Opc) { case AArch64::STPXi: @@ -1049,38 +1280,6 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, } } -static void adaptForLdStOpt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator FirstSPPopI, - MachineBasicBlock::iterator LastPopI) { - // Sometimes (when we restore in the same order as we save), we can end up - // with code like this: - // - // ldp x26, x25, [sp] - // ldp x24, x23, [sp, #16] - // ldp x22, x21, [sp, #32] - // ldp x20, x19, [sp, #48] - // add sp, sp, #64 - // - // In this case, it is always better to put the first ldp at the end, so - // that the load-store optimizer can run and merge the ldp and the add into - // a post-index ldp. - // If we managed to grab the first pop instruction, move it to the end. 
- if (ReverseCSRRestoreSeq) - MBB.splice(FirstSPPopI, &MBB, LastPopI); - // We should end up with something like this now: - // - // ldp x24, x23, [sp, #16] - // ldp x22, x21, [sp, #32] - // ldp x20, x19, [sp, #48] - // ldp x26, x25, [sp] - // add sp, sp, #64 - // - // and the load-store optimizer can merge the last two instructions into: - // - // ldp x26, x25, [sp], #64 - // -} - static bool isTargetWindows(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); } @@ -1099,6 +1298,80 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) { } } +static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) { + if (!(llvm::any_of( + MF.getFrameInfo().getCalleeSavedInfo(), + [](const auto &Info) { return Info.getReg() == AArch64::LR; }) && + MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack))) + return false; + + if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18)) + report_fatal_error("Must reserve x18 to use shadow call stack"); + + return true; +} + +static void emitShadowCallStackPrologue(const TargetInstrInfo &TII, + MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool NeedsWinCFI, + bool NeedsUnwindInfo) { + // Shadow call stack prolog: str x30, [x18], #8 + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost)) + .addReg(AArch64::X18, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::X18) + .addImm(8) + .setMIFlag(MachineInstr::FrameSetup); + + // This instruction also makes x18 live-in to the entry block. + MBB.addLiveIn(AArch64::X18); + + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsUnwindInfo) { + // Emit a CFI instruction that causes 8 to be subtracted from the value of + // x18 when unwinding past this frame. 
+ static const char CFIInst[] = { + dwarf::DW_CFA_val_expression, + 18, // register + 2, // length + static_cast<char>(unsigned(dwarf::DW_OP_breg18)), + static_cast<char>(-8) & 0x7f, // addend (sleb128) + }; + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape( + nullptr, StringRef(CFIInst, sizeof(CFIInst)))); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } +} + +static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII, + MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL) { + // Shadow call stack epilog: ldr x30, [x18, #-8]! + BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre)) + .addReg(AArch64::X18, RegState::Define) + .addReg(AArch64::LR, RegState::Define) + .addReg(AArch64::X18) + .addImm(-8) + .setMIFlag(MachineInstr::FrameDestroy); + + if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo()) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, 18)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameDestroy); + } +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -1109,8 +1382,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - bool needsFrameMoves = - MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool EmitCFI = AFI->needsDwarfUnwindInfo(); bool HasFP = hasFP(MF); bool NeedsWinCFI = needsWinCFI(MF); bool HasWinCFI = false; @@ -1128,8 +1400,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, DebugLoc DL; const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>(); - if 
(MFnI.shouldSignReturnAddress()) { + if (needsShadowCallStackPrologueEpilogue(MF)) + emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI, + MFnI.needsDwarfUnwindInfo()); + if (MFnI.shouldSignReturnAddress()) { unsigned PACI; if (MFnI.shouldSignWithBKey()) { BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY)) @@ -1145,12 +1420,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addReg(AArch64::LR) .addReg(AArch64::SP, RegState::InternalRead); MI.setMIFlag(MachineInstr::FrameSetup); - - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + if (EmitCFI) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } + if (EmitCFI && MFnI.isMTETagged()) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED)) + .setMIFlag(MachineInstr::FrameSetup); } // We signal the presence of a Swift extended frame to external tools by @@ -1227,7 +1507,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); - if (needsFrameMoves) { + if (EmitCFI) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. 
@@ -1261,14 +1541,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI, + EmitCFI); NumBytes = 0; } else if (HomPrologEpilog) { // Stack has been already adjusted. NumBytes -= PrologueSaveSize; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI); + MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI, + EmitCFI); NumBytes -= PrologueSaveSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); @@ -1322,8 +1604,27 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, StackOffset::getFixed(FPOffset), TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); } + if (EmitCFI) { + // Define the current CFA rule to use the provided FP. + const int OffsetToFirstCalleeSaveFromFP = + AFI->getCalleeSaveBaseToFrameRecordOffset() - + AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); + unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( + nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } } + // Now emit the moves for whatever callee saved regs we have (including FP, + // LR if those are saved). Frame instructions for SVE register are emitted + // later, after the instruction which actually save SVE regs. 
+ if (EmitCFI) + emitCalleeSavedGPRLocations(MBB, MBBI); + if (windowsRequiresStackProbe(MF, NumBytes)) { uint64_t NumWords = NumBytes >> 4; if (NeedsWinCFI) { @@ -1436,14 +1737,21 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } // Allocate space for the callee saves (if any). - emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, - -AllocateBefore, TII, - MachineInstr::FrameSetup); + emitFrameOffset( + MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitCFI && !HasFP && AllocateBefore, + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + + if (EmitCFI) + emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); // Finally allocate remaining SVE stack space. emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, - MachineInstr::FrameSetup); + -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, + nullptr, EmitCFI && !HasFP && AllocateAfter, + AllocateBefore + StackOffset::getFixed( + (int64_t)MFI.getStackSize() - NumBytes)); // Allocate space for the rest of the frame. if (NumBytes) { @@ -1458,14 +1766,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) + if (!canUseRedZone(MF)) { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. 
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); - + emitFrameOffset( + MBB, MBBI, DL, scratchSPReg, AArch64::SP, + StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI, EmitCFI && !HasFP, + SVEStackSize + + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + } if (NeedsRealignment) { const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); assert(NrBitsToZero > 1); @@ -1532,109 +1843,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MBB.addLiveIn(AArch64::X1); } } - - if (needsFrameMoves) { - // An example of the prologue: - // - // .globl __foo - // .align 2 - // __foo: - // Ltmp0: - // .cfi_startproc - // .cfi_personality 155, ___gxx_personality_v0 - // Leh_func_begin: - // .cfi_lsda 16, Lexception33 - // - // stp xa,bx, [sp, -#offset]! - // ... - // stp x28, x27, [sp, #offset-32] - // stp fp, lr, [sp, #offset-16] - // add fp, sp, #offset - 16 - // sub sp, sp, #1360 - // - // The Stack: - // +-------------------------------------------+ - // 10000 | ........ | ........ | ........ | ........ | - // 10004 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10008 | ........ | ........ | ........ | ........ | - // 1000c | ........ | ........ | ........ | ........ | - // +===========================================+ - // 10010 | X28 Register | - // 10014 | X28 Register | - // +-------------------------------------------+ - // 10018 | X27 Register | - // 1001c | X27 Register | - // +===========================================+ - // 10020 | Frame Pointer | - // 10024 | Frame Pointer | - // +-------------------------------------------+ - // 10028 | Link Register | - // 1002c | Link Register | - // +===========================================+ - // 10030 | ........ | ........ | ........ | ........ | - // 10034 | ........ | ........ 
| ........ | ........ | - // +-------------------------------------------+ - // 10038 | ........ | ........ | ........ | ........ | - // 1003c | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // - // [sp] = 10030 :: >>initial value<< - // sp = 10020 :: stp fp, lr, [sp, #-16]! - // fp = sp == 10020 :: mov fp, sp - // [sp] == 10020 :: stp x28, x27, [sp, #-16]! - // sp == 10010 :: >>final value<< - // - // The frame pointer (w29) points to address 10020. If we use an offset of - // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 - // for w27, and -32 for w28: - // - // Ltmp1: - // .cfi_def_cfa w29, 16 - // Ltmp2: - // .cfi_offset w30, -8 - // Ltmp3: - // .cfi_offset w29, -16 - // Ltmp4: - // .cfi_offset w27, -24 - // Ltmp5: - // .cfi_offset w28, -32 - - if (HasFP) { - const int OffsetToFirstCalleeSaveFromFP = - AFI->getCalleeSaveBaseToFrameRecordOffset() - - AFI->getCalleeSavedStackSize(); - Register FramePtr = RegInfo->getFrameRegister(MF); - - // Define the current CFA rule to use the provided FP. - unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } else { - unsigned CFIIndex; - if (SVEStackSize) { - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - StackOffset TotalSize = - SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); - CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize)); - } else { - // Encode the stack size of the leaf function. 
- CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); - } - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - } - - // Now emit the moves for whatever callee saved regs we have (including FP, - // LR if those are saved). - emitCalleeSavedFrameMoves(MBB, MBBI); - } } static void InsertReturnAddressAuth(MachineFunction &MF, @@ -1653,7 +1861,8 @@ static void InsertReturnAddressAuth(MachineFunction &MF, // The AUTIASP instruction assembles to a hint instruction before v8.3a so // this instruction can safely used for any v8a architecture. // From v8.3a onwards there are optimised authenticate LR and return - // instructions, namely RETA{A,B}, that can be used instead. + // instructions, namely RETA{A,B}, that can be used instead. In this case the + // DW_CFA_AARCH64_negate_ra_state can't be emitted. if (Subtarget.hasPAuth() && MBBI != MBB.end() && MBBI->getOpcode() == AArch64::RET_ReallyLR) { BuildMI(MBB, MBBI, DL, @@ -1665,6 +1874,12 @@ static void InsertReturnAddressAuth(MachineFunction &MF, MBB, MBBI, DL, TII->get(MFI.shouldSignWithBKey() ? 
AArch64::AUTIBSP : AArch64::AUTIASP)) .setMIFlag(MachineInstr::FrameDestroy); + + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameDestroy); } } @@ -1686,6 +1901,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool NeedsWinCFI = needsWinCFI(MF); + bool EmitCFI = MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(); bool HasWinCFI = false; bool IsFunclet = false; auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); }); @@ -1695,6 +1911,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, IsFunclet = isFuncletReturnInstr(*MBBI); } + auto FinishingTouches = make_scope_exit([&]() { + InsertReturnAddressAuth(MF, MBB); + if (needsShadowCallStackPrologueEpilogue(MF)) + emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL); + if (EmitCFI) + emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator()); + }); + int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -1707,36 +1931,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // How much of the stack used by incoming arguments this function is expected // to restore in this particular epilogue. 
int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB); - - // The stack frame should be like below, - // - // ---------------------- --- - // | | | - // | BytesInStackArgArea| CalleeArgStackSize - // | (NumReusableBytes) | (of tail call) - // | | --- - // | | | - // ---------------------| --- | - // | | | | - // | CalleeSavedReg | | | - // | (CalleeSavedStackSize)| | | - // | | | | - // ---------------------| | NumBytes - // | | StackSize (StackAdjustUp) - // | LocalStackSize | | | - // | (covering callee | | | - // | args) | | | - // | | | | - // ---------------------- --- --- - // - // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize - // = StackSize + ArgumentPopSize - // - // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps - // it as the 2nd argument of AArch64ISD::TC_RETURN. - - auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); }); - bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet); @@ -1771,9 +1965,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); // Assume we can't combine the last pop with the sp restore. + bool CombineAfterCSRBump = false; if (!CombineSPBump && PrologueSaveSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); - while (AArch64InstrInfo::isSEHInstruction(*Pop)) + while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION || + AArch64InstrInfo::isSEHInstruction(*Pop)) Pop = std::prev(Pop); // Converting the last ldp to a post-index ldp is valid only if the last // ldp's offset is 0. 
@@ -1781,15 +1977,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // If the offset is 0 and the AfterCSR pop is not actually trying to // allocate more stack for arguments (in space that an untimely interrupt // may clobber), convert it to a post-index ldp. - if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) + if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) { convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false); - else { + MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI, + MachineInstr::FrameDestroy, PrologueSaveSize); + } else { // If not, make sure to emit an add after the last ldp. // We're doing this by transfering the size to be restored from the // adjustment *before* the CSR pops to the adjustment *after* the CSR // pops. AfterCSRPopSize += PrologueSaveSize; + CombineAfterCSRBump = true; } } @@ -1822,15 +2020,27 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, } if (hasFP(MF) && AFI->hasSwiftAsyncContext()) { - // We need to reset FP to its untagged state on return. Bit 60 is currently - // used to show the presence of an extended frame. - - // BIC x29, x29, #0x1000_0000_0000_0000 - BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri), - AArch64::FP) - .addUse(AArch64::FP) - .addImm(0x10fe) - .setMIFlag(MachineInstr::FrameDestroy); + switch (MF.getTarget().Options.SwiftAsyncFramePointer) { + case SwiftAsyncFramePointerMode::DeploymentBased: + // Avoid the reload as it is GOT relative, and instead fall back to the + // hardcoded value below. This allows a mismatch between the OS and + // application without immediately terminating on the difference. + LLVM_FALLTHROUGH; + case SwiftAsyncFramePointerMode::Always: + // We need to reset FP to its untagged state on return. Bit 60 is + // currently used to show the presence of an extended frame. 
+ + // BIC x29, x29, #0x1000_0000_0000_0000 + BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri), + AArch64::FP) + .addUse(AArch64::FP) + .addImm(0x10fe) + .setMIFlag(MachineInstr::FrameDestroy); + break; + + case SwiftAsyncFramePointerMode::Never: + break; + } } const StackOffset &SVEStackSize = getSVEStackSize(MF); @@ -1838,10 +2048,22 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + + // When we are about to restore the CSRs, the CFA register is SP again. + if (EmitCFI && hasFP(MF)) { + const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo(); + unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes)); + BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); + &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes)); if (HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1873,30 +2095,44 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Deallocate the SVE area. if (SVEStackSize) { - if (AFI->isStackRealigned()) { - if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) + // If we have stack realignment or variable sized objects on the stack, + // restore the stack pointer from the frame pointer prior to SVE CSR + // restoration. 
+ if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) { + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { // Set SP to start of SVE callee-save area from which they can // be reloaded. The code below will deallocate the stack space // space by moving FP -> SP. emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, StackOffset::getScalable(-CalleeSavedSize), TII, MachineInstr::FrameDestroy); + } } else { if (AFI->getSVECalleeSavedStackSize()) { // Deallocate the non-SVE locals first before we can deallocate (and // restore callee saves) from the SVE area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy); + emitFrameOffset( + MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, + false, false, nullptr, EmitCFI && !hasFP(MF), + SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize)); NumBytes = 0; } emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - DeallocateBefore, TII, MachineInstr::FrameDestroy); + DeallocateBefore, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !hasFP(MF), + SVEStackSize + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - DeallocateAfter, TII, MachineInstr::FrameDestroy); + DeallocateAfter, TII, MachineInstr::FrameDestroy, false, + false, nullptr, EmitCFI && !hasFP(MF), + DeallocateAfter + + StackOffset::getFixed(NumBytes + PrologueSaveSize)); } + if (EmitCFI) + emitCalleeSavedSVERestores(MBB, RestoreEnd); } if (!hasFP(MF)) { @@ -1906,23 +2142,24 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, if (RedZone && AfterCSRPopSize == 0) return; + // Pop the local variables off the stack. 
If there are no callee-saved + // registers, it means we are actually positioned at the terminator and can + // combine stack increment for the locals and the stack increment for + // callee-popped arguments into (possibly) a single instruction and be done. bool NoCalleeSaveRestore = PrologueSaveSize == 0; int64_t StackRestoreBytes = RedZone ? 0 : NumBytes; if (NoCalleeSaveRestore) StackRestoreBytes += AfterCSRPopSize; + emitFrameOffset( + MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(StackRestoreBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, + StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize)); + // If we were able to combine the local stack pop with the argument pop, // then we're done. - bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0; - - // If we're done after this, make sure to help the load store optimizer. - if (Done) - adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); - - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(StackRestoreBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - if (Done) { + if (NoCalleeSaveRestore || AfterCSRPopSize == 0) { if (HasWinCFI) { BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1948,29 +2185,29 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); + // When we are about to restore the CSRs, the CFA register is SP again. 
+ if (EmitCFI && hasFP(MF)) { + const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo(); + unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize)); + BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameDestroy); + } + // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save // code in the prologue. if (AfterCSRPopSize) { assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an " "interrupt may have clobbered"); - // Find an insertion point for the first ldp so that it goes before the - // shadow call stack epilog instruction. This ensures that the restore of - // lr from x18 is placed after the restore from sp. - auto FirstSPPopI = MBB.getFirstTerminator(); - while (FirstSPPopI != Begin) { - auto Prev = std::prev(FirstSPPopI); - if (Prev->getOpcode() != AArch64::LDRXpre || - Prev->getOperand(0).getReg() == AArch64::SP) - break; - FirstSPPopI = Prev; - } - adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); - - emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AfterCSRPopSize), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset( + MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, + false, NeedsWinCFI, &HasWinCFI, EmitCFI, + StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0)); } if (HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -2061,8 +2298,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( // right thing for the emergency spill slot. 
bool UseFP = false; if (AFI->hasStackFrame() && !isSVE) { - // We shouldn't prefer using the FP when there is an SVE area - // in between the FP and the non-SVE locals/spills. + // We shouldn't prefer using the FP to access fixed-sized stack objects when + // there are scalable (SVE) objects in between the FP and the fixed-sized + // objects. PreferFP &= !SVEStackSize; // Note: Keeping the following as multiple 'if' statements rather than @@ -2083,7 +2321,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( // offsets is smaller than for positive ones. If an offset is available // via the FP and the SP, use whichever is closest. bool FPOffsetFits = !ForSimm || FPOffset >= -256; - PreferFP |= Offset > -FPOffset; + PreferFP |= Offset > -FPOffset && !SVEStackSize; if (MFI.hasVarSizedObjects()) { // If we have variable sized objects, we can use either FP or BP, as the @@ -2270,7 +2508,7 @@ struct RegPairInfo { static void computeCalleeSaveRegisterPairs( MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs, - bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { + bool NeedsFrameRecord) { if (CSI.empty()) return; @@ -2349,15 +2587,6 @@ static void computeCalleeSaveRegisterPairs( } } - // If either of the registers to be saved is the lr register, it means that - // we also need to save lr in the shadow call stack. - if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) && - MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) { - if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18)) - report_fatal_error("Must reserve x18 to use shadow call stack"); - NeedShadowCallStackProlog = true; - } - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. 
@@ -2476,43 +2705,9 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( DebugLoc DL; SmallVector<RegPairInfo, 8> RegPairs; - bool NeedShadowCallStackProlog = false; - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog, hasFP(MF)); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (NeedShadowCallStackProlog) { - // Shadow call stack prolog: str x30, [x18], #8 - BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost)) - .addReg(AArch64::X18, RegState::Define) - .addReg(AArch64::LR) - .addReg(AArch64::X18) - .addImm(8) - .setMIFlag(MachineInstr::FrameSetup); - - if (NeedsWinCFI) - BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop)) - .setMIFlag(MachineInstr::FrameSetup); - - // Emit a CFI instruction that causes 8 to be subtracted from the value of - // x18 when unwinding past this frame. - static const char CFIInst[] = { - dwarf::DW_CFA_val_expression, - 18, // register - 2, // length - static_cast<char>(unsigned(dwarf::DW_OP_breg18)), - static_cast<char>(-8) & 0x7f, // addend (sleb128) - }; - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape( - nullptr, StringRef(CFIInst, sizeof(CFIInst)))); - BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - - // This instruction also makes x18 live-in to the entry block. 
- MBB.addLiveIn(AArch64::X18); - } + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); + const MachineRegisterInfo &MRI = MF.getRegInfo(); if (homogeneousPrologEpilog(MF)) { auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog)) .setMIFlag(MachineInstr::FrameSetup); @@ -2622,7 +2817,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } bool AArch64FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -2630,14 +2825,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( SmallVector<RegPairInfo, 8> RegPairs; bool NeedsWinCFI = needsWinCFI(MF); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); - bool NeedShadowCallStackProlog = false; - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog, hasFP(MF)); + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); - auto EmitMI = [&](const RegPairInfo &RPI) { + auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; @@ -2694,7 +2887,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( std::swap(Reg1, Reg2); std::swap(FrameIdxReg1, FrameIdxReg2); } - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc)); if (RPI.isPaired()) { MIB.addReg(Reg2, getDefRegState(true)); MIB.addMemOperand(MF.getMachineMemOperand( @@ -2711,6 +2904,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); + + return MIB->getIterator(); }; // SVE objects are always 
restored in reverse order. @@ -2718,31 +2913,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( if (RPI.isScalable()) EmitMI(RPI); - if (ReverseCSRRestoreSeq) { - for (const RegPairInfo &RPI : reverse(RegPairs)) - if (!RPI.isScalable()) - EmitMI(RPI); - } else if (homogeneousPrologEpilog(MF, &MBB)) { - auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog)) + if (homogeneousPrologEpilog(MF, &MBB)) { + auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) .setMIFlag(MachineInstr::FrameDestroy); for (auto &RPI : RegPairs) { MIB.addReg(RPI.Reg1, RegState::Define); MIB.addReg(RPI.Reg2, RegState::Define); } return true; - } else - for (const RegPairInfo &RPI : RegPairs) - if (!RPI.isScalable()) - EmitMI(RPI); - - if (NeedShadowCallStackProlog) { - // Shadow call stack epilog: ldr x30, [x18, #-8]! - BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre)) - .addReg(AArch64::X18, RegState::Define) - .addReg(AArch64::LR, RegState::Define) - .addReg(AArch64::X18) - .addImm(-8) - .setMIFlag(MachineInstr::FrameDestroy); + } + + if (ReverseCSRRestoreSeq) { + MachineBasicBlock::iterator First = MBB.end(); + for (const RegPairInfo &RPI : reverse(RegPairs)) { + if (RPI.isScalable()) + continue; + MachineBasicBlock::iterator It = EmitMI(RPI); + if (First == MBB.end()) + First = It; + } + if (First != MBB.end()) + MBB.splice(MBBI, &MBB, First); + } else { + for (const RegPairInfo &RPI : RegPairs) { + if (RPI.isScalable()) + continue; + (void)EmitMI(RPI); + } } return true; @@ -2941,6 +3138,15 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( // stack slots for them. 
MachineFrameInfo &MFI = MF.getFrameInfo(); auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + + bool UsesWinAAPCS = isTargetWindows(MF); + if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) { + int FrameIdx = MFI.CreateStackObject(8, Align(16), true); + AFI->setSwiftAsyncContextFrameIdx(FrameIdx); + if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; + if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; + } + for (auto &CS : CSI) { Register Reg = CS.getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); @@ -2954,7 +3160,8 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; // Grab 8 bytes below FP for the extended asynchronous frame info. - if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) { + if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS && + Reg == AArch64::FP) { FrameIdx = MFI.CreateStackObject(8, Alignment, true); AFI->setSwiftAsyncContextFrameIdx(FrameIdx); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; @@ -3190,7 +3397,7 @@ public: // instructions. May skip if the replacement is not profitable. May invalidate // the input iterator and replace it with a valid one. 
void emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast); + const AArch64FrameLowering *TFI, bool TryMergeSPUpdate); }; void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { @@ -3329,7 +3536,8 @@ void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE, } void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast) { + const AArch64FrameLowering *TFI, + bool TryMergeSPUpdate) { if (TagStores.empty()) return; TagStoreInstr &FirstTagStore = TagStores[0]; @@ -3359,8 +3567,8 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, emitUnrolled(InsertI); } else { MachineInstr *UpdateInstr = nullptr; - int64_t TotalOffset; - if (IsLast) { + int64_t TotalOffset = 0; + if (TryMergeSPUpdate) { // See if we can merge base register update into the STGloop. // This is done in AArch64LoadStoreOptimizer for "normal" stores, // but STGloop is way too unusual for that, and also it only @@ -3505,7 +3713,7 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, for (auto &Instr : Instrs) { if (EndOffset && *EndOffset != Instr.Offset) { // Found a gap. - TSE.emitCode(InsertI, TFI, /*IsLast = */ false); + TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false); TSE.clear(); } @@ -3513,7 +3721,11 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, EndOffset = Instr.Offset + Instr.Size; } - TSE.emitCode(InsertI, TFI, /*IsLast = */ true); + // Multiple FP/SP updates in a loop cannot be described by CFI instructions. 
+ TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ + !MBB->getParent() + ->getInfo<AArch64FunctionInfo>() + ->needsAsyncDwarfUnwindInfo()); return InsertI; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 31f57cbc49f2..f59860a24d9b 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -29,6 +29,8 @@ public: void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; + void resetCFIToInitialState(MachineBasicBlock &MBB) const override; + MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; @@ -141,13 +143,20 @@ private: int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; - MCCFIInstruction - createDefCFAExpressionFromSP(const TargetRegisterInfo &TRI, - const StackOffset &OffsetFromSP) const; - MCCFIInstruction createCfaOffset(const TargetRegisterInfo &MRI, unsigned DwarfReg, - const StackOffset &OffsetFromDefCFA) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, unsigned StackBumpBytes) const; + void emitCalleeSavedGPRLocations(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedSVELocations(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedGPRRestores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + + /// Emit target zero call-used regs. 
+ void emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 899f069abdd4..82fe5772c99d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -159,6 +159,22 @@ public: return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); } + bool SelectExtractHigh(SDValue N, SDValue &Res) { + if (Subtarget->isLittleEndian() && N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0); + if (N->getOpcode() != ISD::EXTRACT_SUBVECTOR || + !isa<ConstantSDNode>(N->getOperand(1))) + return false; + EVT VT = N->getValueType(0); + EVT LVT = N->getOperand(0).getValueType(); + unsigned Index = N->getConstantOperandVal(1); + if (!VT.is64BitVector() || !LVT.is128BitVector() || + Index != VT.getVectorNumElements()) + return false; + Res = N->getOperand(0); + return true; + } + bool SelectDupZeroOrUndef(SDValue N) { switch(N->getOpcode()) { case ISD::UNDEF: @@ -204,6 +220,11 @@ public: return SelectSVEAddSubImm(N, VT, Imm, Shift); } + template <MVT::SimpleValueType VT> + bool SelectSVECpyDupImm(SDValue N, SDValue &Imm, SDValue &Shift) { + return SelectSVECpyDupImm(N, VT, Imm, Shift); + } + template <MVT::SimpleValueType VT, bool Invert = false> bool SelectSVELogicalImm(SDValue N, SDValue &Imm) { return SelectSVELogicalImm(N, VT, Imm, Invert); @@ -219,6 +240,16 @@ public: return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm); } + bool SelectSVEShiftSplatImmR(SDValue N, SDValue &Imm) { + if (N->getOpcode() != ISD::SPLAT_VECTOR) + return false; + + EVT EltVT = N->getValueType(0).getVectorElementType(); + return SelectSVEShiftImm(N->getOperand(0), /* Low */ 1, + /* High */ EltVT.getFixedSizeInBits(), + /* AllowSaturation */ true, Imm); + } + // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. 
template<signed Min, signed Max, signed Scale, bool Shift> bool SelectCntImm(SDValue N, SDValue &Imm) { @@ -257,6 +288,15 @@ public: return false; } + template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) { + if (auto *CI = dyn_cast<ConstantSDNode>(N)) { + uint64_t C = CI->getZExtValue(); + Imm = CurDAG->getRegister(BaseReg + C, MVT::Other); + return true; + } + return false; + } + /// Form sequences of consecutive 64/128-bit registers for use in NEON /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have /// between 1 and 4 elements. If it contains a single element that is returned @@ -300,6 +340,11 @@ public: return SelectSVERegRegAddrMode(N, Scale, Base, Offset); } + template <unsigned Scale> + bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) { + return SelectSMETileSlice(N, Scale, Vector, Offset); + } + void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); @@ -357,10 +402,8 @@ private: bool SelectCMP_SWAP(SDNode *N); - bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift); - bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); - + bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert); bool SelectSVESignedArithImm(SDValue N, SDValue &Imm); @@ -370,6 +413,8 @@ private: bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm); bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base, SDValue &Offset); + bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector, + SDValue &Offset); bool SelectAllActivePredicate(SDValue N); }; @@ -822,9 +867,17 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, Reg = N.getOperand(0); - // Don't match if free 32-bit -> 64-bit zext can be used instead. 
- if (Ext == AArch64_AM::UXTW && - Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode())) + // Don't match if free 32-bit -> 64-bit zext can be used instead. Use the + // isDef32 as a heuristic for when the operand is likely to be a 32bit def. + auto isDef32 = [](SDValue N) { + unsigned Opc = N.getOpcode(); + return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && + Opc != ISD::CopyFromReg && Opc != ISD::AssertSext && + Opc != ISD::AssertZext && Opc != ISD::AssertAlign && + Opc != ISD::FREEZE; + }; + if (Ext == AArch64_AM::UXTW && Reg->getValueType(0).getSizeInBits() == 32 && + isDef32(Reg)) return false; } @@ -1852,6 +1905,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, VT = Opd0->getValueType(0); } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) { Opd0 = Op0->getOperand(0); + ClampMSB = (VT == MVT::i32); } else if (BiggerPattern) { // Let's pretend a 0 shift right has been performed. // The resulting code will be at least as good as the original one @@ -2710,8 +2764,16 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits, // shift the needed bits into place. SDLoc DL(N); unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + uint64_t LsrImm = LSB; + if (Src->hasOneUse() && + isOpcWithIntImmediate(Src.getNode(), ISD::SRL, LsrImm) && + (LsrImm + LSB) < BitWidth) { + Src = Src->getOperand(0); + LsrImm += LSB; + } + SDNode *LSR = CurDAG->getMachineNode( - ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT), + ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LsrImm, DL, VT), CurDAG->getTargetConstant(BitWidth - 1, DL, VT)); // BFXIL is an alias of BFM, so translate to BFM operands. @@ -2827,15 +2889,15 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) { SDValue Add1 = ShiftAmt->getOperand(1); uint64_t Add0Imm; uint64_t Add1Imm; - // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X - // to avoid the ADD/SUB. 
- if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0)) + if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0)) { + // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X + // to avoid the ADD/SUB. NewShiftAmt = Add0; - // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to - // generate a NEG instead of a SUB of a constant. - else if (ShiftAmt->getOpcode() == ISD::SUB && - isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 && - (Add0Imm % Size == 0)) { + } else if (ShiftAmt->getOpcode() == ISD::SUB && + isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 && + (Add0Imm % Size == 0)) { + // If we are shifting by N-X where N == 0 mod Size, then just shift by -X + // to generate a NEG instead of a SUB from a constant. unsigned NegOpc; unsigned ZeroReg; EVT SubVT = ShiftAmt->getValueType(0); @@ -2852,6 +2914,26 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) { MachineSDNode *Neg = CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1); NewShiftAmt = SDValue(Neg, 0); + } else if (ShiftAmt->getOpcode() == ISD::SUB && + isIntImmediate(Add0, Add0Imm) && (Add0Imm % Size == Size - 1)) { + // If we are shifting by N-X where N == -1 mod Size, then just shift by ~X + // to generate a NOT instead of a SUB from a constant. 
+ unsigned NotOpc; + unsigned ZeroReg; + EVT SubVT = ShiftAmt->getValueType(0); + if (SubVT == MVT::i32) { + NotOpc = AArch64::ORNWrr; + ZeroReg = AArch64::WZR; + } else { + assert(SubVT == MVT::i64); + NotOpc = AArch64::ORNXrr; + ZeroReg = AArch64::XZR; + } + SDValue Zero = + CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT); + MachineSDNode *Not = + CurDAG->getMachineNode(NotOpc, DL, SubVT, Zero, Add1); + NewShiftAmt = SDValue(Not, 0); } else return false; } else { @@ -3108,72 +3190,81 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { return true; } -bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base, - SDValue &Offset) { - auto C = dyn_cast<ConstantSDNode>(N); - if (!C) +bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, + SDValue &Shift) { + if (!isa<ConstantSDNode>(N)) return false; - auto Ty = N->getValueType(0); - - int64_t Imm = C->getSExtValue(); SDLoc DL(N); - - if ((Imm >= -128) && (Imm <= 127)) { - Base = CurDAG->getTargetConstant(Imm, DL, Ty); - Offset = CurDAG->getTargetConstant(0, DL, Ty); - return true; - } - - if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) { - Base = CurDAG->getTargetConstant(Imm/256, DL, Ty); - Offset = CurDAG->getTargetConstant(8, DL, Ty); + uint64_t Val = cast<ConstantSDNode>(N) + ->getAPIntValue() + .trunc(VT.getFixedSizeInBits()) + .getZExtValue(); + + switch (VT.SimpleTy) { + case MVT::i8: + // All immediates are supported. + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32); return true; + case MVT::i16: + case MVT::i32: + case MVT::i64: + // Support 8bit unsigned immediates. + if (Val <= 255) { + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val, DL, MVT::i32); + return true; + } + // Support 16bit unsigned immediates that are a multiple of 256. 
+ if (Val <= 65280 && Val % 256 == 0) { + Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val >> 8, DL, MVT::i32); + return true; + } + break; + default: + break; } return false; } -bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) { - if (auto CNode = dyn_cast<ConstantSDNode>(N)) { - const int64_t ImmVal = CNode->getSExtValue(); - SDLoc DL(N); +bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, + SDValue &Shift) { + if (!isa<ConstantSDNode>(N)) + return false; - switch (VT.SimpleTy) { - case MVT::i8: - // Can always select i8s, no shift, mask the immediate value to - // deal with sign-extended value from lowering. + SDLoc DL(N); + int64_t Val = cast<ConstantSDNode>(N) + ->getAPIntValue() + .trunc(VT.getFixedSizeInBits()) + .getSExtValue(); + + switch (VT.SimpleTy) { + case MVT::i8: + // All immediates are supported. + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32); + return true; + case MVT::i16: + case MVT::i32: + case MVT::i64: + // Support 8bit signed immediates. + if (Val >= -128 && Val <= 127) { Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal & 0xFF, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32); + return true; + } + // Support 16bit signed immediates that are a multiple of 256. + if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) { + Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); + Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32); return true; - case MVT::i16: - // i16 values get sign-extended to 32-bits during lowering. 
- if ((ImmVal & 0xFF) == ImmVal) { - Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); - return true; - } else if ((ImmVal & 0xFF) == 0) { - Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); - Imm = CurDAG->getTargetConstant((ImmVal >> 8) & 0xFF, DL, MVT::i32); - return true; - } - break; - case MVT::i32: - case MVT::i64: - // Range of immediate won't trigger signedness problems for 32/64b. - if ((ImmVal & 0xFF) == ImmVal) { - Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); - return true; - } else if ((ImmVal & 0xFF00) == ImmVal) { - Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32); - return true; - } - break; - default: - break; } + break; + default: + break; } return false; @@ -3901,7 +3992,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { true); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H, true); return; @@ -3922,7 +4013,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { true); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H, true); return; @@ -3943,7 +4034,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { true); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H, true); return; @@ -4267,7 +4358,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && 
Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4284,7 +4375,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4301,7 +4392,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4911,7 +5002,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4928,7 +5019,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -4945,7 +5036,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B); return; } else if (VT == MVT::nxv8i16 
|| VT == MVT::nxv8f16 || - (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { + VT == MVT::nxv8bf16) { SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { @@ -5033,6 +5124,10 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { const unsigned IntNo = cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::aarch64_sme_ldr || + IntNo == Intrinsic::aarch64_sme_str) + return MVT::nxv16i8; + if (IntNo != Intrinsic::aarch64_sve_prf) return EVT(); @@ -5051,12 +5146,19 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &OffImm) { const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root); const DataLayout &DL = CurDAG->getDataLayout(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); - return true; + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. + if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) { + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); + return true; + } + + return false; } if (MemVT == EVT()) @@ -5083,7 +5185,10 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + // We can only encode VL scaled offsets, so only fold in frame indexes + // referencing SVE objects. 
+ if (FI == 0 || MFI.getStackID(FI) == TargetStackID::ScalableVector) + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); @@ -5149,3 +5254,30 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) { return TLI->isAllActivePredicate(*CurDAG, N); } + +bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale, + SDValue &Base, SDValue &Offset) { + if (N.getOpcode() != ISD::ADD) { + Base = N; + Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64); + return true; + } + + // Process an ADD node. + const SDValue LHS = N.getOperand(0); + const SDValue RHS = N.getOperand(1); + + if (auto C = dyn_cast<ConstantSDNode>(RHS)) { + int64_t ImmOff = C->getSExtValue(); + unsigned MaxSize = (1 << Scale) - 1; + + if (ImmOff < 0 || ImmOff > MaxSize) + return false; + + Base = LHS; + Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c539c8617d99..abfe2d507111 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -208,6 +208,7 @@ static bool isMergePassthruOpcode(unsigned Opc) { case AArch64ISD::BSWAP_MERGE_PASSTHRU: case AArch64ISD::REVH_MERGE_PASSTHRU: case AArch64ISD::REVW_MERGE_PASSTHRU: + case AArch64ISD::REVD_MERGE_PASSTHRU: case AArch64ISD::CTLZ_MERGE_PASSTHRU: case AArch64ISD::CTPOP_MERGE_PASSTHRU: case AArch64ISD::DUP_MERGE_PASSTHRU: @@ -289,8 +290,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8bf16); } - if (Subtarget->hasSVE()) { + if (Subtarget->hasSVE() || Subtarget->hasSME()) { // Add legal sve predicate types + addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); 
addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); @@ -324,50 +326,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (useSVEForFixedLengthVectorVT(VT)) addRegisterClass(VT, &AArch64::ZPRRegClass); } - - for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { - setOperationAction(ISD::SADDSAT, VT, Legal); - setOperationAction(ISD::UADDSAT, VT, Legal); - setOperationAction(ISD::SSUBSAT, VT, Legal); - setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); - } - - for (auto VT : - { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, - MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) - setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); - - for (auto VT : - { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, - MVT::nxv2f64 }) { - setCondCodeAction(ISD::SETO, VT, Expand); - setCondCodeAction(ISD::SETOLT, VT, Expand); - setCondCodeAction(ISD::SETLT, VT, Expand); - setCondCodeAction(ISD::SETOLE, VT, Expand); - setCondCodeAction(ISD::SETLE, VT, Expand); - setCondCodeAction(ISD::SETULT, VT, Expand); - setCondCodeAction(ISD::SETULE, VT, Expand); - setCondCodeAction(ISD::SETUGE, VT, Expand); - setCondCodeAction(ISD::SETUGT, VT, Expand); - setCondCodeAction(ISD::SETUEQ, VT, Expand); - setCondCodeAction(ISD::SETUNE, VT, Expand); - - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSINCOS, VT, Expand); - setOperationAction(ISD::FEXP, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FLOG, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - 
setOperationAction(ISD::FLOG10, VT, Expand); - } } // Compute derived properties from the register classes @@ -389,7 +347,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); - setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Custom); @@ -448,6 +406,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently + // aren't handled. // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. @@ -508,16 +468,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // BlockAddress setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. - setOperationAction(ISD::ADDC, MVT::i32, Custom); - setOperationAction(ISD::ADDE, MVT::i32, Custom); - setOperationAction(ISD::SUBC, MVT::i32, Custom); - setOperationAction(ISD::SUBE, MVT::i32, Custom); - setOperationAction(ISD::ADDC, MVT::i64, Custom); - setOperationAction(ISD::ADDE, MVT::i64, Custom); - setOperationAction(ISD::SUBC, MVT::i64, Custom); - setOperationAction(ISD::SUBE, MVT::i64, Custom); - // AArch64 lacks both left-rotate and popcount instructions. 
setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); @@ -568,6 +518,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMULO, MVT::i32, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); + setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); + setOperationAction(ISD::ADDCARRY, MVT::i64, Custom); + setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); + setOperationAction(ISD::SUBCARRY, MVT::i64, Custom); + setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom); + setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom); + setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom); + setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom); + setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); @@ -581,64 +540,41 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, else setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::v4f16, Expand); - setOperationAction(ISD::FREM, MVT::v8f16, Expand); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::v4f16, Expand); - setOperationAction(ISD::FPOW, MVT::v8f16, Expand); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); - setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::v4f16, Expand); - setOperationAction(ISD::FCOS, MVT::v8f16, Expand); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::v4f16, Expand); - setOperationAction(ISD::FSIN, MVT::v8f16, Expand); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); - setOperationAction(ISD::FSINCOS, MVT::v8f16, 
Expand); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::v4f16, Expand); - setOperationAction(ISD::FEXP, MVT::v8f16, Expand); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); - setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::v4f16, Expand); - setOperationAction(ISD::FLOG, MVT::v8f16, Expand); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); - setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); - setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); + for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, + ISD::FCOS, ISD::FSIN, ISD::FSINCOS, + ISD::FEXP, ISD::FEXP2, ISD::FLOG, + ISD::FLOG2, ISD::FLOG10, ISD::STRICT_FREM, + ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS, + ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, + ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) { + setOperationAction(Op, MVT::f16, Promote); + setOperationAction(Op, MVT::v4f16, Expand); + setOperationAction(Op, MVT::v8f16, Expand); + } if (!Subtarget->hasFullFP16()) { - setOperationAction(ISD::SELECT, MVT::f16, Promote); - setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); - setOperationAction(ISD::SETCC, MVT::f16, Promote); - setOperationAction(ISD::BR_CC, MVT::f16, Promote); - setOperationAction(ISD::FADD, MVT::f16, Promote); - setOperationAction(ISD::FSUB, MVT::f16, Promote); - setOperationAction(ISD::FMUL, MVT::f16, Promote); - setOperationAction(ISD::FDIV, MVT::f16, Promote); - setOperationAction(ISD::FMA, MVT::f16, Promote); - setOperationAction(ISD::FNEG, MVT::f16, Promote); - setOperationAction(ISD::FABS, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FSQRT, 
MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); - setOperationAction(ISD::FMINNUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); - setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); + for (auto Op : + {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC, + ISD::BR_CC, ISD::FADD, ISD::FSUB, + ISD::FMUL, ISD::FDIV, ISD::FMA, + ISD::FNEG, ISD::FABS, ISD::FCEIL, + ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, + ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, + ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, + ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, + ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, + ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, + ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, + ISD::STRICT_FMAXIMUM}) + setOperationAction(Op, MVT::f16, Promote); + + // Round-to-integer need custom lowering for fp16, as Promote doesn't work + // because the result type is integer. + for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, + ISD::STRICT_LLRINT}) + setOperationAction(Op, MVT::f16, Custom); // promote v4f16 to v4f32 when that is known to be safe. setOperationAction(ISD::FADD, MVT::v4f16, Promote); @@ -691,37 +627,35 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // AArch64 has implementations of a lot of rounding-like FP operations. 
- for (MVT Ty : {MVT::f32, MVT::f64}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - setOperationAction(ISD::FMINNUM, Ty, Legal); - setOperationAction(ISD::FMAXNUM, Ty, Legal); - setOperationAction(ISD::FMINIMUM, Ty, Legal); - setOperationAction(ISD::FMAXIMUM, Ty, Legal); - setOperationAction(ISD::LROUND, Ty, Legal); - setOperationAction(ISD::LLROUND, Ty, Legal); - setOperationAction(ISD::LRINT, Ty, Legal); - setOperationAction(ISD::LLRINT, Ty, Legal); - } - - if (Subtarget->hasFullFP16()) { - setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); - setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); - } + for (auto Op : + {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, + ISD::FRINT, ISD::FTRUNC, ISD::FROUND, + ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND, + ISD::LLROUND, ISD::LRINT, ISD::LLRINT, + ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT, + ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, + ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND, + ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) { + for (MVT Ty : {MVT::f32, MVT::f64}) + setOperationAction(Op, Ty, 
Legal); + if (Subtarget->hasFullFP16()) + setOperationAction(Op, MVT::f16, Legal); + } + + // Basic strict FP operations are legal + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) { + for (MVT Ty : {MVT::f32, MVT::f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + setOperationAction(Op, MVT::f16, Legal); + } + + // Strict conversion to a larger type is legal + for (auto VT : {MVT::f32, MVT::f64}) + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); setOperationAction(ISD::PREFETCH, MVT::Other, Custom); @@ -891,47 +825,33 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::ABS); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::FP_TO_UINT); - setTargetDAGCombine(ISD::FP_TO_SINT_SAT); - setTargetDAGCombine(ISD::FP_TO_UINT_SAT); - setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP, + ISD::UINT_TO_FP}); + + setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT, ISD::FDIV}); // Try and combine setcc with csel setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::VECTOR_SPLICE); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::CONCAT_VECTORS); - setTargetDAGCombine(ISD::INSERT_SUBVECTOR); - setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, 
ISD::SIGN_EXTEND, + ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG, + ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, + ISD::INSERT_SUBVECTOR, ISD::STORE}); if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine({ISD::SELECT, ISD::VSELECT}); + + setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, + ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, + ISD::VECREDUCE_ADD, ISD::STEP_VECTOR}); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::VECREDUCE_ADD); - setTargetDAGCombine(ISD::STEP_VECTOR); + setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER}); setTargetDAGCombine(ISD::FP_EXTEND); @@ -980,43 +900,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: - setOperationAction(ISD::FABS, MVT::v1f64, Expand); - setOperationAction(ISD::FADD, MVT::v1f64, Expand); - setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); - setOperationAction(ISD::FCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FDIV, MVT::v1f64, Expand); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); - setOperationAction(ISD::FMA, MVT::v1f64, Expand); - setOperationAction(ISD::FMUL, MVT::v1f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); - setOperationAction(ISD::FNEG, MVT::v1f64, Expand); - setOperationAction(ISD::FPOW, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FROUND, MVT::v1f64, Expand); - setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand); - setOperationAction(ISD::FRINT, MVT::v1f64, Expand); 
- setOperationAction(ISD::FSIN, MVT::v1f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); - setOperationAction(ISD::FSUB, MVT::v1f64, Expand); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); - setOperationAction(ISD::SETCC, MVT::v1f64, Expand); - setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v1i64, Expand); - - setOperationAction(ISD::MUL, MVT::v1i64, Expand); + for (auto Op : + {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC, + ISD::BR_CC, ISD::FADD, ISD::FSUB, + ISD::FMUL, ISD::FDIV, ISD::FMA, + ISD::FNEG, ISD::FABS, ISD::FCEIL, + ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, + ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, + ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, + ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, + ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, + ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, + ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, + ISD::STRICT_FMAXIMUM}) + setOperationAction(Op, MVT::v1f64, Expand); + + for (auto Op : + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, + ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL, + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, + ISD::STRICT_SINT_TO_FP, 
ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND}) + setOperationAction(Op, MVT::v1i64, Expand); // AArch64 doesn't have a direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. @@ -1024,14 +930,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); // Or, direct i32 -> f16 vector conversion. Set it so custom, so the // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP}) + for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32}) + setOperationAction(Op, VT, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); @@ -1088,6 +992,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32}) { + setOperationAction(ISD::AVGFLOORS, VT, Legal); + setOperationAction(ISD::AVGFLOORU, VT, Legal); + setOperationAction(ISD::AVGCEILS, VT, Legal); + setOperationAction(ISD::AVGCEILU, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); } @@ -1141,31 +1049,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // AArch64 has implementations of a lot of rounding-like FP operations. 
- for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - } - - if (Subtarget->hasFullFP16()) { - for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - } + for (auto Op : + {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, + ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR, + ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT, + ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) { + for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + for (MVT Ty : {MVT::v4f16, MVT::v8f16}) + setOperationAction(Op, Ty, Legal); } - if (Subtarget->hasSVE()) - setOperationAction(ISD::VSCALE, MVT::i32, Custom); - setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); @@ -1174,6 +1069,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + + // ADDP custom lowering + for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) + setOperationAction(ISD::ADD, VT, Custom); + // FADDP custom lowering + for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) + 
setOperationAction(ISD::FADD, VT, Custom); + } + + if (Subtarget->hasSME()) { + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); } if (Subtarget->hasSVE()) { @@ -1194,7 +1100,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); @@ -1224,6 +1130,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); + + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); } // Illegal unpacked integer vector types. @@ -1234,10 +1149,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Legalize unpacked bitcasts to REINTERPRET_CAST. 
for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, - MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) + MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) setOperationAction(ISD::BITCAST, VT, Custom); - for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { + for (auto VT : + { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, + MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); + + for (auto VT : + {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); @@ -1269,18 +1190,33 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MSCATTER, VT, Custom); } - for (MVT VT : MVT::fp_scalable_vector_valuetypes()) { - for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) { - // Avoid marking truncating FP stores as legal to prevent the - // DAGCombiner from creating unsupported truncating stores. + // Firstly, exclude all scalable vector extending loads/truncating stores, + // include both integer and floating scalable vector. + for (MVT VT : MVT::scalable_vector_valuetypes()) { + for (MVT InnerVT : MVT::scalable_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); - // SVE does not have floating-point extending loads. setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } + // Then, selectively enable those which we directly support. 
+ setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal); + setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal); + setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal); + setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal); + setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal); + setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal); + for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { + setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal); + setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal); + setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal); + setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal); + setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal); + setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal); + } + // SVE supports truncating stores of 64 and 128-bit vectors setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom); @@ -1295,7 +1231,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); @@ -1326,6 +1262,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG, VT, 
Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + + setCondCodeAction(ISD::SETO, VT, Expand); + setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); + setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETUGT, VT, Expand); + setCondCodeAction(ISD::SETUEQ, VT, Expand); + setCondCodeAction(ISD::SETONE, VT, Expand); } for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { @@ -1334,13 +1293,23 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); } - setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + // NEON doesn't support integer divides, but SVE does + for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, + MVT::v4i32, MVT::v1i64, MVT::v2i64}) { + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + } + + // NEON doesn't support 64-bit vector integer muls, but SVE does. + setOperationAction(ISD::MUL, MVT::v1i64, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. 
if (Subtarget->useSVEForFixedLengthVectors()) { @@ -1367,32 +1336,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHS, MVT::v1i64, Custom); setOperationAction(ISD::MULHS, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v1i64, Custom); setOperationAction(ISD::MULHU, MVT::v2i64, Custom); - setOperationAction(ISD::SDIV, MVT::v8i8, Custom); - setOperationAction(ISD::SDIV, MVT::v16i8, Custom); - setOperationAction(ISD::SDIV, MVT::v4i16, Custom); - setOperationAction(ISD::SDIV, MVT::v8i16, Custom); - setOperationAction(ISD::SDIV, MVT::v2i32, Custom); - setOperationAction(ISD::SDIV, MVT::v4i32, Custom); - setOperationAction(ISD::SDIV, MVT::v1i64, Custom); - setOperationAction(ISD::SDIV, MVT::v2i64, Custom); setOperationAction(ISD::SMAX, MVT::v1i64, Custom); setOperationAction(ISD::SMAX, MVT::v2i64, Custom); setOperationAction(ISD::SMIN, MVT::v1i64, Custom); setOperationAction(ISD::SMIN, MVT::v2i64, Custom); - setOperationAction(ISD::UDIV, MVT::v8i8, Custom); - setOperationAction(ISD::UDIV, MVT::v16i8, Custom); - setOperationAction(ISD::UDIV, MVT::v4i16, Custom); - setOperationAction(ISD::UDIV, MVT::v8i16, Custom); - setOperationAction(ISD::UDIV, MVT::v2i32, Custom); - setOperationAction(ISD::UDIV, MVT::v4i32, Custom); - setOperationAction(ISD::UDIV, MVT::v1i64, Custom); - setOperationAction(ISD::UDIV, MVT::v2i64, Custom); setOperationAction(ISD::UMAX, MVT::v1i64, Custom); setOperationAction(ISD::UMAX, MVT::v2i64, Custom); setOperationAction(ISD::UMIN, MVT::v1i64, Custom); @@ -1426,6 +1377,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); 
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); + + setOperationAction(ISD::VSCALE, MVT::i32, Custom); } if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { @@ -1434,6 +1387,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); + + IsStrictFPEnabled = true; } void AArch64TargetLowering::addTypeForNEON(MVT VT) { @@ -1490,10 +1445,10 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); + for (unsigned Opcode : + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) + setOperationAction(Opcode, VT, Custom); if (!VT.isFloatingPoint()) setOperationAction(ISD::ABS, VT, Legal); @@ -1503,14 +1458,39 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); - // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. + // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP + // NEON types. 
if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::bf16 && (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) for (unsigned Opcode : - {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) + {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM, + ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, + ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, + ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, + ISD::STRICT_FSQRT}) setOperationAction(Opcode, VT, Legal); + // Strict fp extend and trunc are legal + if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16) + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); + if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64) + setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); + + // FIXME: We could potentially make use of the vector comparison instructions + // for STRICT_FSETCC and STRICT_FSETCSS, but there's a number of + // complications: + // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons, + // so we would need to expand when the condition code doesn't match the + // kind of comparison. + // * Some kinds of comparison require more than one FCMXY instruction so + // would need to be expanded instead. + // * The lowering of the non-strict versions involves target-specific ISD + // nodes so we would likely need to add strict versions of all of them and + // handle them appropriately. + setOperationAction(ISD::STRICT_FSETCC, VT, Expand); + setOperationAction(ISD::STRICT_FSETCCS, VT, Expand); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -1526,9 +1506,11 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, if (!Subtarget->hasSVE()) return true; - // We can only support legal predicate result types. + // We can only support legal predicate result types. 
We can use the SVE + // whilelo instruction for generating fixed-width predicates too. if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && - ResVT != MVT::nxv16i1) + ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 && + ResVT != MVT::v8i1 && ResVT != MVT::v16i1) return true; // The whilelo instruction only works with i32 or i64 scalar inputs. @@ -1559,7 +1541,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setCondCodeAction(ISD::SETUGE, VT, Expand); setCondCodeAction(ISD::SETUGT, VT, Expand); setCondCodeAction(ISD::SETUEQ, VT, Expand); - setCondCodeAction(ISD::SETUNE, VT, Expand); + setCondCodeAction(ISD::SETONE, VT, Expand); } // Mark integer truncating stores/extending loads as having custom lowering @@ -1830,11 +1812,21 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them Known. void AArch64TargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, KnownBits &Known, - const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { + const SDValue Op, KnownBits &Known, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; + case AArch64ISD::DUP: { + SDValue SrcOp = Op.getOperand(0); + Known = DAG.computeKnownBits(SrcOp, Depth + 1); + if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) { + assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() && + "Expected DUP implicit truncation"); + Known = Known.trunc(Op.getScalarValueSizeInBits()); + } + break; + } case AArch64ISD::CSEL: { KnownBits Known2; Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); @@ -2006,7 +1998,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) MAKE_CASE(AArch64ISD::ABDS_PRED) 
MAKE_CASE(AArch64ISD::ABDU_PRED) - MAKE_CASE(AArch64ISD::ADD_PRED) MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::MULHS_PRED) MAKE_CASE(AArch64ISD::MULHU_PRED) @@ -2016,7 +2007,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::SMIN_PRED) MAKE_CASE(AArch64ISD::SRA_PRED) MAKE_CASE(AArch64ISD::SRL_PRED) - MAKE_CASE(AArch64ISD::SUB_PRED) MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) MAKE_CASE(AArch64ISD::UMIN_PRED) @@ -2061,6 +2051,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::DUPLANE16) MAKE_CASE(AArch64ISD::DUPLANE32) MAKE_CASE(AArch64ISD::DUPLANE64) + MAKE_CASE(AArch64ISD::DUPLANE128) MAKE_CASE(AArch64ISD::MOVI) MAKE_CASE(AArch64ISD::MOVIshift) MAKE_CASE(AArch64ISD::MOVIedit) @@ -2108,10 +2099,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FCMLTz) MAKE_CASE(AArch64ISD::SADDV) MAKE_CASE(AArch64ISD::UADDV) - MAKE_CASE(AArch64ISD::SRHADD) - MAKE_CASE(AArch64ISD::URHADD) - MAKE_CASE(AArch64ISD::SHADD) - MAKE_CASE(AArch64ISD::UHADD) MAKE_CASE(AArch64ISD::SDOT) MAKE_CASE(AArch64ISD::UDOT) MAKE_CASE(AArch64ISD::SMINV) @@ -2150,6 +2137,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FMINNMV_PRED) MAKE_CASE(AArch64ISD::FMUL_PRED) MAKE_CASE(AArch64ISD::FSUB_PRED) + MAKE_CASE(AArch64ISD::RDSVL) MAKE_CASE(AArch64ISD::BIC) MAKE_CASE(AArch64ISD::BIT) MAKE_CASE(AArch64ISD::CBZ) @@ -2267,10 +2255,13 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::INDEX_VECTOR) + 
MAKE_CASE(AArch64ISD::ADDP) + MAKE_CASE(AArch64ISD::SADDLP) MAKE_CASE(AArch64ISD::UADDLP) MAKE_CASE(AArch64ISD::CALL_RVMARKER) MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) @@ -2278,6 +2269,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING) MAKE_CASE(AArch64ISD::MOPS_MEMCOPY) MAKE_CASE(AArch64ISD::MOPS_MEMMOVE) + MAKE_CASE(AArch64ISD::CALL_BTI) } #undef MAKE_CASE return nullptr; @@ -2351,6 +2343,92 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.add(MI.getOperand(1)); // slice index register + MIB.add(MI.getOperand(2)); // slice index offset + MIB.add(MI.getOperand(3)); // pg + MIB.add(MI.getOperand(4)); // base + MIB.add(MI.getOperand(5)); // offset + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA)); + + MIB.addReg(AArch64::ZA, RegState::Define); + MIB.add(MI.getOperand(0)); // Vector select register + MIB.add(MI.getOperand(1)); // Vector select offset + MIB.add(MI.getOperand(2)); // Base + MIB.add(MI.getOperand(1)); // Offset, same as vector select offset + + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // pn + MIB.add(MI.getOperand(2)); // pm + MIB.add(MI.getOperand(3)); // zn + MIB.add(MI.getOperand(4)); // zm + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // Slice index register + MIB.add(MI.getOperand(2)); // Slice index offset + MIB.add(MI.getOperand(3)); // pg + MIB.add(MI.getOperand(4)); // zn + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M)); + MIB.add(MI.getOperand(0)); // Mask + + unsigned Mask = MI.getOperand(0).getImm(); + for (unsigned I = 0; I < 8; I++) { + if (Mask & (1 << I)) + MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine); + } + + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -2366,9 +2444,14 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case TargetOpcode::STATEPOINT: // STATEPOINT is a pseudo instruction which has no implicit defs/uses // while bl call instruction (where statepoint will be lowered at the end) - // has implicit def. Add this implicit dead def here as a workaround. - MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true, - true, false, true)); + // has implicit def. This def is early-clobber as it will be set at + // the moment of the call and earlier than any use is read. + // Add this implicit dead def here as a workaround. + MI.addOperand(*MI.getMF(), + MachineOperand::CreateReg( + AArch64::LR, /*isDef*/ true, + /*isImp*/ true, /*isKill*/ false, /*isDead*/ true, + /*isUndef*/ false, /*isEarlyClobber*/ true)); LLVM_FALLTHROUGH; case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: @@ -2376,6 +2459,108 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_B: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_H: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_S: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_D: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_Q: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_B: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_H: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB); + case 
AArch64::LD1_MXIPXX_V_PSEUDO_S: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_D: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB); + case AArch64::LD1_MXIPXX_V_PSEUDO_Q: + return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB); + case AArch64::LDR_ZA_PSEUDO: + return EmitFill(MI, BB); + case AArch64::BFMOPA_MPPZZ_PSEUDO: + return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::BFMOPS_MPPZZ_PSEUDO: + return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::FMOPAL_MPPZZ_PSEUDO: + return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::FMOPSL_MPPZZ_PSEUDO: + return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB); + case AArch64::FMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::FMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::FMOPA_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::FMOPS_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::SMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::SMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::UMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::UMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::SUMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::SUMOPS_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::USMOPA_MPPZZ_S_PSEUDO: + return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::USMOPS_MPPZZ_S_PSEUDO: + return 
EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB); + case AArch64::SMOPA_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::SMOPS_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::UMOPA_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::UMOPS_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::SUMOPA_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::SUMOPS_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::USMOPA_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::USMOPS_MPPZZ_D_PSEUDO: + return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB); + case AArch64::INSERT_MXIPZ_H_PSEUDO_B: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI, + BB); + case AArch64::INSERT_MXIPZ_H_PSEUDO_H: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI, + BB); + case AArch64::INSERT_MXIPZ_H_PSEUDO_S: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI, + BB); + case AArch64::INSERT_MXIPZ_H_PSEUDO_D: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI, + BB); + case AArch64::INSERT_MXIPZ_H_PSEUDO_Q: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI, + BB); + case AArch64::INSERT_MXIPZ_V_PSEUDO_B: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI, + BB); + case AArch64::INSERT_MXIPZ_V_PSEUDO_H: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI, + BB); + case AArch64::INSERT_MXIPZ_V_PSEUDO_S: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI, + BB); + case AArch64::INSERT_MXIPZ_V_PSEUDO_D: + return 
EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI, + BB); + case AArch64::INSERT_MXIPZ_V_PSEUDO_Q: + return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI, + BB); + case AArch64::ZERO_M_PSEUDO: + return EmitZero(MI, BB); } } @@ -2596,7 +2781,17 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, bool IsSignaling) { EVT VT = LHS.getValueType(); assert(VT != MVT::f128); - assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented"); + + const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); + + if (VT == MVT::f16 && !FullFP16) { + LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, + {Chain, LHS}); + RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, + {LHS.getValue(1), RHS}); + Chain = RHS.getValue(1); + VT = MVT::f32; + } unsigned Opcode = IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS}); @@ -2605,8 +2800,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); - const bool FullFP16 = - static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); + const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); if (VT.isFloatingPoint()) { assert(VT != MVT::f128); @@ -2714,8 +2908,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; - const bool FullFP16 = - static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); + const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); if (LHS.getValueType().isFloatingPoint()) { assert(LHS.getValueType() != MVT::f128); @@ -3282,40 +3475,68 @@ SDValue AArch64TargetLowering::LowerXOR(SDValue 
Op, SelectionDAG &DAG) const { return Op; } -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); +// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C' +// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else +// sets 'C' bit to 0. +static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { + SDLoc DL(Value); + EVT VT = Value.getValueType(); + SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value; + SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT); + SDValue Cmp = + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1); + return Cmp.getValue(1); +} - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) +// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0. +// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1. +static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG, + bool Invert) { + assert(Flag.getResNo() == 1); + SDLoc DL(Flag); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + unsigned Cond = Invert ? 
AArch64CC::LO : AArch64CC::HS; + SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag); +} + +// Value is 1 if 'V' bit of NZCV is 1, else 0 +static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) { + assert(Flag.getResNo() == 1); + SDLoc DL(Flag); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag); +} + +// This lowering is inefficient, but it will get cleaned up by +// `foldOverflowCheck` +static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, + bool IsSigned) { + EVT VT0 = Op.getValue(0).getValueType(); + EVT VT1 = Op.getValue(1).getValueType(); + + if (VT0 != MVT::i32 && VT0 != MVT::i64) return SDValue(); - SDVTList VTs = DAG.getVTList(VT, MVT::i32); + bool InvertCarry = Opcode == AArch64ISD::SBCS; + SDValue OpLHS = Op.getOperand(0); + SDValue OpRHS = Op.getOperand(1); + SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: - llvm_unreachable("Invalid code"); - case ISD::ADDC: - Opc = AArch64ISD::ADDS; - break; - case ISD::SUBC: - Opc = AArch64ISD::SUBS; - break; - case ISD::ADDE: - Opc = AArch64ISD::ADCS; - ExtraOp = true; - break; - case ISD::SUBE: - Opc = AArch64ISD::SBCS; - ExtraOp = true; - break; - } + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(VT0, VT1); + + SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, + OpRHS, OpCarryIn); - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), - Op.getOperand(2)); + SDValue OutFlag = + IsSigned ? 
overflowFlagToValue(Sum.getValue(1), VT1, DAG) + : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); + + return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { @@ -3417,7 +3638,8 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. - EVT InVT = Op.getOperand(0).getValueType(); + bool IsStrict = Op->isStrictFPOpcode(); + EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType(); EVT VT = Op.getValueType(); if (VT.isScalableVector()) { @@ -3437,6 +3659,12 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, !Subtarget->hasFullFP16()) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); + if (IsStrict) { + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); @@ -3446,6 +3674,13 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, uint64_t InVTSize = InVT.getFixedSizeInBits(); if (VTSize < InVTSize) { SDLoc dl(Op); + if (IsStrict) { + InVT = InVT.changeVectorElementTypeToInteger(); + SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); + return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl); + } SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); @@ -3457,10 +3692,30 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); + if 
(IsStrict) { + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } + // Use a scalar operation for conversions between single-element vectors of + // the same size. + if (NumElts == 1) { + SDLoc dl(Op); + SDValue Extract = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), + Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64)); + EVT ScalarVT = VT.getScalarType(); + if (IsStrict) + return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, + {Op.getOperand(0), Extract}); + return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); + } + // Type changing conversions are illegal. return Op; } @@ -3475,8 +3730,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, // f16 conversions are promoted to f32 when full fp16 is not supported. if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { - assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); SDLoc dl(Op); + if (IsStrict) { + SDValue Ext = + DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, + {Op.getOperand(0), SrcVal}); + return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); @@ -3507,7 +3768,7 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, "Saturation width cannot exceed result width"); // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT. - // Currently, the `llvm.fpto[su]i.sat.*` instrinsics don't accept scalable + // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable // types, so this is hard to reach. 
if (DstVT.isScalableVector()) return SDValue(); @@ -3545,17 +3806,14 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, SDValue Sat; if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { SDValue MinC = DAG.getConstant( - APInt::getSignedMaxValue(SatWidth).sextOrSelf(SrcElementWidth), DL, - IntVT); + APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT); SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC); SDValue MaxC = DAG.getConstant( - APInt::getSignedMinValue(SatWidth).sextOrSelf(SrcElementWidth), DL, - IntVT); + APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT); Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC); } else { SDValue MinC = DAG.getConstant( - APInt::getAllOnesValue(SatWidth).zextOrSelf(SrcElementWidth), DL, - IntVT); + APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT); Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC); } @@ -3604,14 +3862,14 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SDValue Sat; if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { SDValue MinC = DAG.getConstant( - APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT); + APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT); SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC); SDValue MaxC = DAG.getConstant( - APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth), DL, DstVT); + APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT); Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC); } else { SDValue MinC = DAG.getConstant( - APInt::getAllOnesValue(SatWidth).zextOrSelf(DstWidth), DL, DstVT); + APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT); Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC); } @@ -3623,9 +3881,10 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 
// Any additional optimization in this function should be recorded // in the cost tables. + bool IsStrict = Op->isStrictFPOpcode(); EVT VT = Op.getValueType(); SDLoc dl(Op); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); EVT InVT = In.getValueType(); unsigned Opc = Op.getOpcode(); bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; @@ -3653,6 +3912,13 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); + if (IsStrict) { + In = DAG.getNode(Opc, dl, {CastVT, MVT::Other}, + {Op.getOperand(0), In}); + return DAG.getNode( + ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, + {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)}); + } In = DAG.getNode(Opc, dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } @@ -3661,9 +3927,24 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); + if (IsStrict) + return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In}); return DAG.getNode(Opc, dl, VT, In); } + // Use a scalar operation for conversions between single-element vectors of + // the same size. + if (VT.getVectorNumElements() == 1) { + SDValue Extract = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), + In, DAG.getConstant(0, dl, MVT::i64)); + EVT ScalarVT = VT.getScalarType(); + if (IsStrict) + return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, + {Op.getOperand(0), Extract}); + return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); + } + return Op; } @@ -3676,10 +3957,15 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SDValue SrcVal = Op.getOperand(IsStrict ? 
1 : 0); // f16 conversions are promoted to f32 when full fp16 is not supported. - if (Op.getValueType() == MVT::f16 && - !Subtarget->hasFullFP16()) { - assert(!IsStrict && "Lowering of strict fp16 not yet implemented"); + if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); + if (IsStrict) { + SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other}, + {Op.getOperand(0), SrcVal}); + return DAG.getNode( + ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other}, + {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)}); + } return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal), @@ -3742,6 +4028,14 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, return LowerFixedLengthBitcastToSVE(Op, DAG); if (OpVT.isScalableVector()) { + // Bitcasting between unpacked vector types of different element counts is + // not a NOP because the live elements are laid out differently. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) + return SDValue(); + if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && "Expected int->fp bitcast!"); @@ -3964,7 +4258,7 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED); // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
@@ -4059,10 +4353,26 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { case AArch64ISD::SETCC_MERGE_ZERO: return Reinterpret; case ISD::INTRINSIC_WO_CHAIN: - if (InOp.getConstantOperandVal(0) == Intrinsic::aarch64_sve_ptrue) + switch (InOp.getConstantOperandVal(0)) { + case Intrinsic::aarch64_sve_ptrue: + case Intrinsic::aarch64_sve_cmpeq_wide: + case Intrinsic::aarch64_sve_cmpne_wide: + case Intrinsic::aarch64_sve_cmpge_wide: + case Intrinsic::aarch64_sve_cmpgt_wide: + case Intrinsic::aarch64_sve_cmplt_wide: + case Intrinsic::aarch64_sve_cmple_wide: + case Intrinsic::aarch64_sve_cmphs_wide: + case Intrinsic::aarch64_sve_cmphi_wide: + case Intrinsic::aarch64_sve_cmplo_wide: + case Intrinsic::aarch64_sve_cmpls_wide: return Reinterpret; + } } + // Splat vectors of one will generate ptrue instructions + if (ISD::isConstantSplatVectorAllOnes(InOp.getNode())) + return Reinterpret; + // Otherwise, zero the newly introduced lanes. SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all); SDValue MaskReinterpret = @@ -4073,12 +4383,12 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); + SDLoc DL(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::aarch64_mops_memset_tag: { auto Node = cast<MemIntrinsicSDNode>(Op.getNode()); - SDLoc DL(Op); SDValue Chain = Node->getChain(); SDValue Dst = Op.getOperand(2); SDValue Val = Op.getOperand(3); @@ -4100,6 +4410,15 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, // changed. 
return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL); } + case Intrinsic::aarch64_sme_get_pstatesm: { + SDValue Chain = Op.getOperand(0); + SDValue MRS = DAG.getNode( + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other), + Chain, DAG.getConstant(AArch64SysReg::SVCR, DL, MVT::i64)); + SDValue Mask = DAG.getConstant(/* PSTATE.SM */ 1, DL, MVT::i64); + SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, MRS, Mask); + return DAG.getMergeValues({And, Chain}, DL); + } } } @@ -4196,6 +4515,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_clz: return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sme_cntsb: + return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), + DAG.getConstant(1, dl, MVT::i32)); + case Intrinsic::aarch64_sme_cntsh: { + SDValue One = DAG.getConstant(1, dl, MVT::i32); + SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One); + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One); + } + case Intrinsic::aarch64_sme_cntsw: { + SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), + DAG.getConstant(1, dl, MVT::i32)); + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, + DAG.getConstant(2, dl, MVT::i32)); + } + case Intrinsic::aarch64_sme_cntsd: { + SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), + DAG.getConstant(1, dl, MVT::i32)); + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, + DAG.getConstant(3, dl, MVT::i32)); + } case Intrinsic::aarch64_sve_cnt: { SDValue Data = Op.getOperand(3); // CTPOP only supports integer operands. 
@@ -4300,6 +4639,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_revw: return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_revd: + return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_sxtb: return DAG.getNode( AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), @@ -4336,7 +4678,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), Op.getOperand(1)); - case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); @@ -4382,9 +4723,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, IntNo == Intrinsic::aarch64_neon_shadd); bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || IntNo == Intrinsic::aarch64_neon_urhadd); - unsigned Opcode = - IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD); + unsigned Opcode = IsSignedAdd + ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS) + : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU); return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } @@ -4395,8 +4736,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::aarch64_neon_saddlp: case Intrinsic::aarch64_neon_uaddlp: { - unsigned Opcode = AArch64ISD::UADDLP; + unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp + ? 
AArch64ISD::UADDLP + : AArch64ISD::SADDLP; return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1)); } case Intrinsic::aarch64_neon_sdot: @@ -4428,19 +4772,26 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { return false; } -bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { - if (VT.getVectorElementType() == MVT::i32 && - VT.getVectorElementCount().getKnownMinValue() >= 4 && - !VT.isFixedLengthVector()) - return true; +bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT, + EVT DataVT) const { + // SVE only supports implicit extension of 32-bit indices. + if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32) + return false; - return false; + // Indices cannot be smaller than the main data type. + if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits()) + return false; + + // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit + // element container type, which would violate the previous clause. 
+ return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2; } bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return ExtVal.getValueType().isScalableVector() || - useSVEForFixedLengthVectorVT(ExtVal.getValueType(), - /*OverrideNEON=*/true); + useSVEForFixedLengthVectorVT( + ExtVal.getValueType(), + /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()); } unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { @@ -4466,29 +4817,6 @@ unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { return AddrModes.find(Key)->second; } -unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { - std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { - {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), - AArch64ISD::SST1_PRED}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), - AArch64ISD::SST1_UXTW_PRED}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), - AArch64ISD::SST1_PRED}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), - AArch64ISD::SST1_SXTW_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), - AArch64ISD::SST1_SCALED_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), - AArch64ISD::SST1_UXTW_SCALED_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), - AArch64ISD::SST1_SCALED_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), - AArch64ISD::SST1_SXTW_SCALED_PRED}, - }; - auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); - return AddrModes.find(Key)->second; -} - unsigned getSignExtendedGatherOpcode(unsigned Opcode) { switch (Opcode) { default: @@ -4511,267 +4839,184 @@ unsigned getSignExtendedGatherOpcode(unsigned Opcode) { } } -bool getGatherScatterIndexIsExtended(SDValue Index) { - unsigned Opcode = Index.getOpcode(); - if (Opcode == 
ISD::SIGN_EXTEND_INREG) - return true; - - if (Opcode == ISD::AND) { - SDValue Splat = Index.getOperand(1); - if (Splat.getOpcode() != ISD::SPLAT_VECTOR) - return false; - ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0)); - if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF) - return false; - return true; - } - - return false; -} - -// If the base pointer of a masked gather or scatter is null, we -// may be able to swap BasePtr & Index and use the vector + register -// or vector + immediate addressing mode, e.g. -// VECTOR + REGISTER: -// getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices) -// -> getelementptr %offset, <vscale x N x T> %indices -// VECTOR + IMMEDIATE: -// getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices) -// -> getelementptr #x, <vscale x N x T> %indices -void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, - unsigned &Opcode, bool IsGather, - SelectionDAG &DAG) { - if (!isNullConstant(BasePtr)) - return; - - // FIXME: This will not match for fixed vector type codegen as the nodes in - // question will have fixed<->scalable conversions around them. This should be - // moved to a DAG combine or complex pattern so that is executes after all of - // the fixed vector insert and extracts have been removed. This deficiency - // will result in a sub-optimal addressing mode being used, i.e. an ADD not - // being folded into the scatter/gather. - ConstantSDNode *Offset = nullptr; - if (Index.getOpcode() == ISD::ADD) - if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) { - if (isa<ConstantSDNode>(SplatVal)) - Offset = cast<ConstantSDNode>(SplatVal); - else { - BasePtr = SplatVal; - Index = Index->getOperand(0); - return; - } - } - - unsigned NewOp = - IsGather ? 
AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED; - - if (!Offset) { - std::swap(BasePtr, Index); - Opcode = NewOp; - return; - } - - uint64_t OffsetVal = Offset->getZExtValue(); - unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8; - auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64); - - if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) { - // Index is out of range for the immediate addressing mode - BasePtr = ConstOffset; - Index = Index->getOperand(0); - return; - } - - // Immediate is in range - Opcode = NewOp; - BasePtr = Index->getOperand(0); - Index = ConstOffset; -} - SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); - assert(MGT && "Can only custom lower gather load nodes"); - - bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector(); - SDValue Index = MGT->getIndex(); + SDLoc DL(Op); SDValue Chain = MGT->getChain(); SDValue PassThru = MGT->getPassThru(); SDValue Mask = MGT->getMask(); SDValue BasePtr = MGT->getBasePtr(); - ISD::LoadExtType ExtTy = MGT->getExtensionType(); - - ISD::MemIndexType IndexType = MGT->getIndexType(); - bool IsScaled = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; - bool IsSigned = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; - bool IdxNeedsExtend = - getGatherScatterIndexIsExtended(Index) || - Index.getSimpleValueType().getVectorElementType() == MVT::i32; - bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD; - - EVT VT = PassThru.getSimpleValueType(); - EVT IndexVT = Index.getSimpleValueType(); + SDValue Index = MGT->getIndex(); + SDValue Scale = MGT->getScale(); + EVT VT = Op.getValueType(); EVT MemVT = MGT->getMemoryVT(); - SDValue InputVT = DAG.getValueType(MemVT); - - if (VT.getVectorElementType() == MVT::bf16 && - !static_cast<const AArch64Subtarget 
&>(DAG.getSubtarget()).hasBF16()) - return SDValue(); + ISD::LoadExtType ExtType = MGT->getExtensionType(); + ISD::MemIndexType IndexType = MGT->getIndexType(); - if (IsFixedLength) { + // SVE supports zero (and so undef) passthrough values only, everything else + // must be handled manually by an explicit select on the load's output. + if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) { + SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale}; + SDValue Load = + DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, + MGT->getMemOperand(), IndexType, ExtType); + SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru); + return DAG.getMergeValues({Select, Load.getValue(1)}, DL); + } + + bool IsScaled = MGT->isIndexScaled(); + bool IsSigned = MGT->isIndexSigned(); + + // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else + // must be calculated before hand. + uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); + if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { + assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); + EVT IndexVT = Index.getValueType(); + Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, + DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); + Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); + + SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, + MGT->getMemOperand(), IndexType, ExtType); + } + + // Lower fixed length gather to a scalable equivalent. 
+ if (VT.isFixedLengthVector()) { assert(Subtarget->useSVEForFixedLengthVectors() && - "Cannot lower when not using SVE for fixed vectors"); - if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { - IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); - MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); - } else { - MemVT = getContainerForFixedLengthVector(DAG, MemVT); - IndexVT = MemVT.changeTypeToInteger(); - } - InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); - Mask = DAG.getNode( - ISD::SIGN_EXTEND, DL, - VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); - } - - if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) - PassThru = SDValue(); - - if (VT.isFloatingPoint() && !IsFixedLength) { - // Handle FP data by using an integer gather and casting the result. - if (PassThru) { - EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount()); - PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG); - } - InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); - } - - SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other); - - if (getGatherScatterIndexIsExtended(Index)) - Index = Index.getOperand(0); - - unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend); - selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, - /*isGather=*/true, DAG); - - if (ResNeedsSignExtend) - Opcode = getSignExtendedGatherOpcode(Opcode); - - if (IsFixedLength) { - if (Index.getSimpleValueType().isFixedLengthVector()) - Index = convertToScalableVector(DAG, IndexVT, Index); - if (BasePtr.getSimpleValueType().isFixedLengthVector()) - BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr); + "Cannot lower when not using SVE for fixed vectors!"); + + // NOTE: Handle floating-point as if integer then bitcast the result. 
+ EVT DataVT = VT.changeVectorElementTypeToInteger(); + MemVT = MemVT.changeVectorElementTypeToInteger(); + + // Find the smallest integer fixed length vector we can use for the gather. + EVT PromotedVT = VT.changeVectorElementType(MVT::i32); + if (DataVT.getVectorElementType() == MVT::i64 || + Index.getValueType().getVectorElementType() == MVT::i64 || + Mask.getValueType().getVectorElementType() == MVT::i64) + PromotedVT = VT.changeVectorElementType(MVT::i64); + + // Promote vector operands except for passthrough, which we know is either + // undef or zero, and thus best constructed directly. + unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); + Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); + + // A promoted result type forces the need for an extending load. + if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD) + ExtType = ISD::EXTLOAD; + + EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); + + // Convert fixed length vector operands to scalable. + MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); + Index = convertToScalableVector(DAG, ContainerVT, Index); Mask = convertFixedMaskToScalableVector(Mask, DAG); - } - - SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT}; - SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops); - Chain = Result.getValue(1); - - if (IsFixedLength) { - Result = convertFromScalableVector( - DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()), - Result); - Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result); - Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); - - if (PassThru) - Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru); - } else { - if (PassThru) - Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru); - + PassThru = PassThru->isUndef() ? 
DAG.getUNDEF(ContainerVT) + : DAG.getConstant(0, DL, ContainerVT); + + // Emit equivalent scalable vector gather. + SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; + SDValue Load = + DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL, + Ops, MGT->getMemOperand(), IndexType, ExtType); + + // Extract fixed length data then convert to the required result type. + SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load); + Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result); if (VT.isFloatingPoint()) - Result = getSVESafeBitCast(VT, Result, DAG); + Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); + + return DAG.getMergeValues({Result, Load.getValue(1)}, DL); } - return DAG.getMergeValues({Result, Chain}, DL); + // Everything else is legal. + return Op; } SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); - assert(MSC && "Can only custom lower scatter store nodes"); - bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector(); - - SDValue Index = MSC->getIndex(); + SDLoc DL(Op); SDValue Chain = MSC->getChain(); SDValue StoreVal = MSC->getValue(); SDValue Mask = MSC->getMask(); SDValue BasePtr = MSC->getBasePtr(); - - ISD::MemIndexType IndexType = MSC->getIndexType(); - bool IsScaled = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; - bool IsSigned = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; - bool NeedsExtend = - getGatherScatterIndexIsExtended(Index) || - Index.getSimpleValueType().getVectorElementType() == MVT::i32; - - EVT VT = StoreVal.getSimpleValueType(); - EVT IndexVT = Index.getSimpleValueType(); - SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Index = MSC->getIndex(); + SDValue Scale = MSC->getScale(); + EVT VT = StoreVal.getValueType(); EVT MemVT = MSC->getMemoryVT(); - SDValue InputVT = DAG.getValueType(MemVT); + ISD::MemIndexType IndexType = 
MSC->getIndexType(); + bool Truncating = MSC->isTruncatingStore(); - if (VT.getVectorElementType() == MVT::bf16 && - !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) - return SDValue(); + bool IsScaled = MSC->isIndexScaled(); + bool IsSigned = MSC->isIndexSigned(); + + // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else + // must be calculated before hand. + uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); + if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { + assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); + EVT IndexVT = Index.getValueType(); + Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, + DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); + Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); - if (IsFixedLength) { + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, + MSC->getMemOperand(), IndexType, Truncating); + } + + // Lower fixed length scatter to a scalable equivalent. + if (VT.isFixedLengthVector()) { assert(Subtarget->useSVEForFixedLengthVectors() && - "Cannot lower when not using SVE for fixed vectors"); - if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { - IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); - MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); - } else { - MemVT = getContainerForFixedLengthVector(DAG, MemVT); - IndexVT = MemVT.changeTypeToInteger(); + "Cannot lower when not using SVE for fixed vectors!"); + + // Once bitcast we treat floating-point scatters as if integer. 
+ if (VT.isFloatingPoint()) { + VT = VT.changeVectorElementTypeToInteger(); + MemVT = MemVT.changeVectorElementTypeToInteger(); + StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal); } - InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); - - StoreVal = - DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal); - StoreVal = DAG.getNode( - ISD::ANY_EXTEND, DL, - VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal); - StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal); - Mask = DAG.getNode( - ISD::SIGN_EXTEND, DL, - VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); - } else if (VT.isFloatingPoint()) { - // Handle FP data by casting the data so an integer scatter can be used. - EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount()); - StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG); - InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); - } - - if (getGatherScatterIndexIsExtended(Index)) - Index = Index.getOperand(0); - - unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend); - selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, - /*isGather=*/false, DAG); - - if (IsFixedLength) { - if (Index.getSimpleValueType().isFixedLengthVector()) - Index = convertToScalableVector(DAG, IndexVT, Index); - if (BasePtr.getSimpleValueType().isFixedLengthVector()) - BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr); + + // Find the smallest integer fixed length vector we can use for the scatter. + EVT PromotedVT = VT.changeVectorElementType(MVT::i32); + if (VT.getVectorElementType() == MVT::i64 || + Index.getValueType().getVectorElementType() == MVT::i64 || + Mask.getValueType().getVectorElementType() == MVT::i64) + PromotedVT = VT.changeVectorElementType(MVT::i64); + + // Promote vector operands. + unsigned ExtOpcode = IsSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); + Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); + StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal); + + // A promoted value type forces the need for a truncating store. + if (PromotedVT != VT) + Truncating = true; + + EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); + + // Convert fixed length vector operands to scalable. + MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); + Index = convertToScalableVector(DAG, ContainerVT, Index); Mask = convertFixedMaskToScalableVector(Mask, DAG); + StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal); + + // Emit equivalent scalable vector scatter. + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, + MSC->getMemOperand(), IndexType, Truncating); } - SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT}; - return DAG.getNode(Opcode, DL, VTs, Ops); + // Everything else is legal. 
+ return Op; } SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -4780,7 +5025,9 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { assert(LoadNode && "Expected custom lowering of a masked load node"); EVT VT = Op->getValueType(0); - if (useSVEForFixedLengthVectorVT(VT, true)) + if (useSVEForFixedLengthVectorVT( + VT, + /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) return LowerFixedLengthVectorMLoadToSVE(Op, DAG); SDValue PassThru = LoadNode->getPassThru(); @@ -4847,7 +5094,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, EVT MemVT = StoreNode->getMemoryVT(); if (VT.isVector()) { - if (useSVEForFixedLengthVectorVT(VT, true)) + if (useSVEForFixedLengthVectorVT( + VT, + /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) return LowerFixedLengthVectorStoreToSVE(Op, DAG); unsigned AS = StoreNode->getAddressSpace(); @@ -5007,6 +5256,22 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { Cmp.getValue(1)); } +static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); + + AArch64CC::CondCode CC; + if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { + SDLoc dl(Op); + SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); @@ -5026,6 +5291,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); + case ISD::BRCOND: + return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SELECT: @@ -5046,11 +5313,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerVACOPY(Op, DAG); case 
ISD::VAARG: return LowerVAARG(Op, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: - return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::ADDCARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/); + case ISD::SUBCARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/); + case ISD::SADDO_CARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/); + case ISD::SSUBO_CARRY: + return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: @@ -5165,11 +5435,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::MUL: return LowerMUL(Op, DAG); case ISD::MULHS: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED); case ISD::MULHU: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: @@ -5234,11 +5502,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFixedLengthVectorLoadToSVE(Op, DAG); return LowerLOAD(Op, DAG); case ISD::ADD: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); case ISD::AND: - return LowerToScalableOp(Op, DAG); case ISD::SUB: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED); + return LowerToScalableOp(Op, DAG); case ISD::FMAXIMUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); case ISD::FMAXNUM: @@ -5260,12 +5526,23 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::BSWAP: return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); case ISD::CTLZ: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU); 
case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::VECTOR_SPLICE: return LowerVECTOR_SPLICE(Op, DAG); + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: { + assert(Op.getOperand(1).getValueType() == MVT::f16 && + "Expected custom lowering of rounding operations only for f16"); + SDLoc DL(Op); + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } } } @@ -5275,10 +5552,7 @@ bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( EVT VT, bool OverrideNEON) const { - if (!Subtarget->useSVEForFixedLengthVectors()) - return false; - - if (!VT.isFixedLengthVector()) + if (!VT.isFixedLengthVector() || !VT.isSimple()) return false; // Don't use SVE for vectors we cannot scalarize if required. @@ -5300,12 +5574,16 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( // All SVE implementations support NEON sized vectors. if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) - return true; + return Subtarget->hasSVE(); // Ensure NEON MVTs only belong to a single register class. if (VT.getFixedSizeInBits() <= 128) return false; + // Ensure wider than NEON code generation is enabled. + if (!Subtarget->useSVEForFixedLengthVectors()) + return false; + // Don't use SVE for types that don't fit. 
if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) return false; @@ -5322,6 +5600,36 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( // Calling Convention Implementation //===----------------------------------------------------------------------===// +static unsigned getIntrinsicID(const SDNode *N) { + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + default: + return Intrinsic::not_intrinsic; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + if (IID < Intrinsic::num_intrinsics) + return IID; + return Intrinsic::not_intrinsic; + } + } +} + +bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + if (!N0.hasOneUse()) + return false; + + unsigned IID = getIntrinsicID(N1.getNode()); + // Avoid reassociating expressions that can be lowered to smlal/umlal. + if (IID == Intrinsic::aarch64_neon_umull || + N1.getOpcode() == AArch64ISD::UMULL || + IID == Intrinsic::aarch64_neon_smull || + N1.getOpcode() == AArch64ISD::SMULL) + return N0.getOpcode() != ISD::ADD; + + return true; +} + /// Selects the correct CCAssignFn for a given CallingConvention value. 
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { @@ -5368,8 +5676,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments( const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); + const Function &F = MF.getFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); + bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv()); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs, + DAG.getTargetLoweringInfo(), MF.getDataLayout()); + if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); })) + FuncInfo->setIsSVECC(true); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; @@ -5383,7 +5699,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // we use a special version of AnalyzeFormalArguments to pass in ValVT and // LocVT. 
unsigned NumArgs = Ins.size(); - Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); + Function::const_arg_iterator CurOrigArg = F.arg_begin(); unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; @@ -5454,11 +5770,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments( else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; else if (RegVT.isScalableVector() && - RegVT.getVectorElementType() == MVT::i1) + RegVT.getVectorElementType() == MVT::i1) { + FuncInfo->setIsSVECC(true); RC = &AArch64::PPRRegClass; - else if (RegVT.isScalableVector()) + } else if (RegVT.isScalableVector()) { + FuncInfo->setIsSVECC(true); RC = &AArch64::ZPRRegClass; - else + } else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. @@ -5580,7 +5898,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // i1 arguments are zero-extended to i8 by the caller. Emit a // hint to reflect this. 
if (Ins[i].isOrigArg()) { - Argument *OrigArg = MF.getFunction().getArg(Ins[i].getOrigArgIndex()); + Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex()); if (OrigArg->getType()->isIntegerTy(1)) { if (!Ins[i].Flags.isZExt()) { ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL, @@ -5595,7 +5913,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments( assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); // varargs - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic @@ -5843,14 +6160,62 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { } } +static void analyzeCallOperands(const AArch64TargetLowering &TLI, + const AArch64Subtarget *Subtarget, + const TargetLowering::CallLoweringInfo &CLI, + CCState &CCInfo) { + const SelectionDAG &DAG = CLI.DAG; + CallingConv::ID CalleeCC = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); + + unsigned NumArgs = Outs.size(); + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + + bool UseVarArgCC = false; + if (IsVarArg) { + // On Windows, the fixed arguments in a vararg call are passed in GPRs + // too, so use the vararg CC to force them to integer registers. + if (IsCalleeWin64) { + UseVarArgCC = true; + } else { + UseVarArgCC = !Outs[i].IsFixed; + } + } else { + // Get type of the original argument. + EVT ActualVT = + TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty, + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ArgVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ArgVT = MVT::i16; + } + + CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } +} + bool AArch64TargetLowering::isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + const CallLoweringInfo &CLI) const { + CallingConv::ID CalleeCC = CLI.CallConv; if (!mayTailCallThisCC(CalleeCC)) return false; + SDValue Callee = CLI.Callee; + bool IsVarArg = CLI.IsVarArg; + const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + const SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + const SelectionDAG &DAG = CLI.DAG; MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); @@ -5860,7 +6225,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // The check for matching callee-saved regs will determine whether it is // eligible for TCO. if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && - AArch64RegisterInfo::hasSVEArgsOrReturn(&MF)) + MF.getInfo<AArch64FunctionInfo>()->isSVECC()) CallerCC = CallingConv::AArch64_SVE_VectorCall; bool CCMatch = CallerCC == CalleeCC; @@ -5915,30 +6280,14 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // I want anyone implementing a new calling convention to think long and hard // about this assert. 
- assert((!isVarArg || CalleeCC == CallingConv::C) && + assert((!IsVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); LLVMContext &C = *DAG.getContext(); - if (isVarArg && !Outs.empty()) { - // At least two cases here: if caller is fastcc then we can't have any - // memory arguments (we'd be expected to clean up the stack afterwards). If - // caller is C then we could potentially use its argument area. - - // FIXME: for now we take the most conservative of these in both cases: - // disallow all variadic memory operands. - SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); - - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (const CCValAssign &ArgLoc : ArgLocs) - if (!ArgLoc.isRegLoc()) - return false; - } - // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, - CCAssignFnForCall(CalleeCC, isVarArg), - CCAssignFnForCall(CallerCC, isVarArg))) + CCAssignFnForCall(CalleeCC, IsVarArg), + CCAssignFnForCall(CallerCC, IsVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -5958,9 +6307,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); + CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C); + + analyzeCallOperands(*this, Subtarget, CLI, CCInfo); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); + if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) { + // When we are musttail, additional checks have been done and we can safely ignore this check + // At least two cases here: if caller is fastcc then we can't have any + // memory arguments (we'd be expected to clean up the stack afterwards). 
If + // caller is C then we could potentially use its argument area. + + // FIXME: for now we take the most conservative of these in both cases: + // disallow all variadic memory operands. + for (const CCValAssign &ArgLoc : ArgLocs) + if (!ArgLoc.isRegLoc()) + return false; + } const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -6051,7 +6413,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; + CallingConv::ID &CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); @@ -6061,7 +6423,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; bool IsSibCall = false; - bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CallConv); + bool GuardWithBTI = false; + + if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && + !Subtarget->noBTIAtReturnTwice()) { + GuardWithBTI = FuncInfo->branchTargetEnforcement(); + } // Check callee args/returns for SVE registers and set calling convention // accordingly. @@ -6079,8 +6446,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { // Check if it's really possible to do a tail call. - IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); + IsTailCall = isEligibleForTailCallOptimization(CLI); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: @@ -6101,56 +6467,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); if (IsVarArg) { - // Handle fixed and variable vector arguments differently. - // Variable vector arguments always go into memory. 
unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { - MVT ArgVT = Outs[i].VT; - if (!Outs[i].IsFixed && ArgVT.isScalableVector()) + if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) report_fatal_error("Passing SVE types to variadic functions is " "currently not supported"); - - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - bool UseVarArgCC = !Outs[i].IsFixed; - // On Windows, the fixed arguments in a vararg call are passed in GPRs - // too, so use the vararg CC to force them to integer registers. - if (IsCalleeWin64) - UseVarArgCC = true; - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } else { - // At this point, Outs[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. - // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeCallOperands to pass in ValVT and - // LocVT. - unsigned NumArgs = Outs.size(); - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Outs[i].VT; - // Get type of the original argument. - EVT ActualVT = getValueType(DAG.getDataLayout(), - CLI.getArgs()[Outs[i].OrigArgIndex].Ty, - /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
- if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - ValVT = MVT::i8; - else if (ActualMVT == MVT::i16) - ValVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; } } + analyzeCallOperands(*this, Subtarget, CLI, CCInfo); + // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -6536,7 +6863,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT); Ops.insert(Ops.begin() + 1, GA); - } + } else if (GuardWithBTI) + CallOpc = AArch64ISD::CALL_BTI; // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); @@ -7313,103 +7641,88 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->hasNEON()) + return SDValue(); + EVT VT = Op.getValueType(); + EVT IntVT = VT.changeTypeToInteger(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (VT.isScalableVector()) { - if (VT != SrcVT) - return SDValue(); + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); - // copysign(x,y) -> (y & SIGN_MASK) | (x & ~SIGN_MASK) - // - // A possible alternative sequence involves using FNEG_MERGE_PASSTHRU; - // maybe useful for copysign operations with mismatched VTs. - // - // IntVT here is chosen so it's a legal type with the same element width - // as the input. 
- EVT IntVT = + if (VT.isScalableVector()) + IntVT = getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger()); - unsigned NumBits = VT.getScalarSizeInBits(); - SDValue SignMask = DAG.getConstant(APInt::getSignMask(NumBits), DL, IntVT); - SDValue InvSignMask = DAG.getNOT(DL, SignMask, IntVT); - SDValue Sign = DAG.getNode(ISD::AND, DL, IntVT, SignMask, - getSVESafeBitCast(IntVT, In2, DAG)); - SDValue Magnitude = DAG.getNode(ISD::AND, DL, IntVT, InvSignMask, - getSVESafeBitCast(IntVT, In1, DAG)); - SDValue IntResult = DAG.getNode(ISD::OR, DL, IntVT, Sign, Magnitude); - return getSVESafeBitCast(VT, IntResult, DAG); - } - if (!Subtarget->hasNEON()) + if (VT != In2.getValueType()) return SDValue(); - if (SrcVT.bitsLT(VT)) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT.bitsGT(VT)) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); + auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) { + if (VT.isScalableVector()) + return getSVESafeBitCast(VT, Op, DAG); - EVT VecVT; - uint64_t EltMask; - SDValue VecVal1, VecVal2; + return DAG.getBitcast(VT, Op); + }; - auto setVecVal = [&] (int Idx) { + SDValue VecVal1, VecVal2; + EVT VecVT; + auto SetVecVal = [&](int Idx = -1) { if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, - DAG.getUNDEF(VecVT), In2); + VecVal1 = + DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); + VecVal2 = + DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); + VecVal1 = BitCast(VecVT, In1, DAG); + VecVal2 = BitCast(VecVT, In2, DAG); } }; - - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - VecVT = (VT == MVT::v2f32 ? 
MVT::v2i32 : MVT::v4i32); - EltMask = 0x80000000ULL; - setVecVal(AArch64::ssub); - } else if (VT == MVT::f64 || VT == MVT::v2f64) { + if (VT.isVector()) { + VecVT = IntVT; + SetVecVal(); + } else if (VT == MVT::f64) { VecVT = MVT::v2i64; - - // We want to materialize a mask with the high bit set, but the AdvSIMD - // immediate moves cannot materialize that in a single instruction for - // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = 0; - - setVecVal(AArch64::dsub); - } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { - VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); - EltMask = 0x8000ULL; - setVecVal(AArch64::hsub); + SetVecVal(AArch64::dsub); + } else if (VT == MVT::f32) { + VecVT = MVT::v4i32; + SetVecVal(AArch64::ssub); + } else if (VT == MVT::f16) { + VecVT = MVT::v8i16; + SetVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } - SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); + unsigned BitWidth = In1.getScalarValueSizeInBits(); + SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT); - // If we couldn't materialize the mask above, then the mask vector will be - // the zero vector, and we need to negate it here. + // We want to materialize a mask with every bit but the high bit set, but the + // AdvSIMD immediate moves cannot materialize that in a single instruction for + // 64-bit elements. Instead, materialize all bits set and then negate that. 
if (VT == MVT::f64 || VT == MVT::v2f64) { - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); + SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT); + SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV); + SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV); + SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV); } - SDValue Sel = - DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - + SDValue BSP = + DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2); if (VT == MVT::f16) - return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); + return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP); if (VT == MVT::f32) - return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); - else if (VT == MVT::f64) - return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); - else - return DAG.getNode(ISD::BITCAST, DL, VT, Sel); + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP); + if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP); + + return BitCast(VT, BSP, DAG); } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { @@ -7485,7 +7798,8 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); assert(VT.isScalableVector() || - useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)); + useSVEForFixedLengthVectorVT( + VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())); SDLoc DL(Op); SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); @@ -7517,22 +7831,19 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, } if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) { + 
useSVEForFixedLengthVectorVT( + VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { switch (Opcode) { default: llvm_unreachable("Wrong instruction"); case ISD::SMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED); case ISD::SMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED); case ISD::UMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED); case ISD::UMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED); } } @@ -7547,9 +7858,9 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, EVT VT = Op.getValueType(); if (VT.isScalableVector() || - useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU, - true); + useSVEForFixedLengthVectorVT( + VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU); SDLoc DL(Op); SDValue REVB; @@ -8990,12 +9301,13 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, if (V.isUndef()) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(V.getOperand(1))) { + !isa<ConstantSDNode>(V.getOperand(1)) || + V.getOperand(0).getValueType().isScalableVector()) { LLVM_DEBUG( dbgs() << "Reshuffle failed: " "a shuffle can only come from building a vector from " - "various elements of other vectors, provided their " - "indices are constant\n"); + "various elements of other fixed-width vectors, provided " + "their indices are constant\n"); return SDValue(); } @@ -9011,10 +9323,72 @@ SDValue 
AArch64TargetLowering::ReconstructShuffle(SDValue Op, Source->MaxElt = std::max(Source->MaxElt, EltNo); } + // If we have 3 or 4 sources, try to generate a TBL, which will at least be + // better than moving to/from gpr registers for larger vectors. + if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) { + // Construct a mask for the tbl. We may need to adjust the index for types + // larger than i8. + SmallVector<unsigned, 16> Mask; + unsigned OutputFactor = VT.getScalarSizeInBits() / 8; + for (unsigned I = 0; I < NumElts; ++I) { + SDValue V = Op.getOperand(I); + if (V.isUndef()) { + for (unsigned OF = 0; OF < OutputFactor; OF++) + Mask.push_back(-1); + continue; + } + // Set the Mask lanes adjusted for the size of the input and output + // lanes. The Mask is always i8, so it will set OutputFactor lanes per + // output element, adjusted in their positions per input and output types. + unsigned Lane = V.getConstantOperandVal(1); + for (unsigned S = 0; S < Sources.size(); S++) { + if (V.getOperand(0) == Sources[S].Vec) { + unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits(); + unsigned InputBase = 16 * S + Lane * InputSize / 8; + for (unsigned OF = 0; OF < OutputFactor; OF++) + Mask.push_back(InputBase + OF); + break; + } + } + } + + // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to + // v16i8, and the TBLMask + SmallVector<SDValue, 16> TBLOperands; + TBLOperands.push_back(DAG.getConstant(Sources.size() == 3 + ? Intrinsic::aarch64_neon_tbl3 + : Intrinsic::aarch64_neon_tbl4, + dl, MVT::i32)); + for (unsigned i = 0; i < Sources.size(); i++) { + SDValue Src = Sources[i].Vec; + EVT SrcVT = Src.getValueType(); + Src = DAG.getBitcast(SrcVT.is64BitVector() ? 
MVT::v8i8 : MVT::v16i8, Src); + assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) && + "Expected a legally typed vector"); + if (SrcVT.is64BitVector()) + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src, + DAG.getUNDEF(MVT::v8i8)); + TBLOperands.push_back(Src); + } + + SmallVector<SDValue, 16> TBLMask; + for (unsigned i = 0; i < Mask.size(); i++) + TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32)); + assert((Mask.size() == 8 || Mask.size() == 16) && + "Expected a v8i8 or v16i8 Mask"); + TBLOperands.push_back( + DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask)); + + SDValue Shuffle = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, + Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands); + return DAG.getBitcast(VT, Shuffle); + } + if (Sources.size() > 2) { - LLVM_DEBUG( - dbgs() << "Reshuffle failed: currently only do something sane when at " - "most two source vectors are involved\n"); + LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something " + << "sensible when at most two source vectors are " + << "involved\n"); return SDValue(); } @@ -9039,8 +9413,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); - uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); - if (SrcVTSize == VTSize) + TypeSize SrcVTSize = SrcVT.getSizeInBits(); + if (SrcVTSize == TypeSize::Fixed(VTSize)) continue; // This stage of the search produces a source with the same element type as @@ -9049,7 +9423,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - if (SrcVTSize < VTSize) { + if (SrcVTSize.getFixedValue() < VTSize) { assert(2 * SrcVTSize == VTSize); // We can pad out the smaller vector for free, so if it's part of a // shuffle... 
@@ -9059,7 +9433,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, continue; } - if (SrcVTSize != 2 * VTSize) { + if (SrcVTSize.getFixedValue() != 2 * VTSize) { LLVM_DEBUG( dbgs() << "Reshuffle failed: result vector too small to extract\n"); return SDValue(); @@ -9205,6 +9579,56 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { return true; } +// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from +// v4i32s. This is really a truncate, which we can construct out of (legal) +// concats and truncate nodes. +static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) { + if (V.getValueType() != MVT::v16i8) + return SDValue(); + assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR"); + + for (unsigned X = 0; X < 4; X++) { + // Check the first item in each group is an extract from lane 0 of a v4i32 + // or v4i16. + SDValue BaseExt = V.getOperand(X * 4); + if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + (BaseExt.getOperand(0).getValueType() != MVT::v4i16 && + BaseExt.getOperand(0).getValueType() != MVT::v4i32) || + !isa<ConstantSDNode>(BaseExt.getOperand(1)) || + BaseExt.getConstantOperandVal(1) != 0) + return SDValue(); + SDValue Base = BaseExt.getOperand(0); + // And check the other items are extracts from the same vector. + for (unsigned Y = 1; Y < 4; Y++) { + SDValue Ext = V.getOperand(X * 4 + Y); + if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Ext.getOperand(0) != Base || + !isa<ConstantSDNode>(Ext.getOperand(1)) || + Ext.getConstantOperandVal(1) != Y) + return SDValue(); + } + } + + // Turn the buildvector into a series of truncates and concates, which will + // become uzip1's. Any v4i32s we found get truncated to v4i16, which are + // concat together to produce 2 v8i16. These are both truncated and concat + // together. 
+ SDLoc DL(V); + SDValue Trunc[4] = { + V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0), + V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)}; + for (int I = 0; I < 4; I++) + if (Trunc[I].getValueType() == MVT::v4i32) + Trunc[I] = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Trunc[I]); + SDValue Concat0 = + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]); + SDValue Concat1 = + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]); + SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0); + SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1); +} + /// Check if a vector shuffle corresponds to a DUP instructions with a larger /// element width than the vector lane type. If that is the case the function /// returns true and writes the value of the DUP instruction lane operand into @@ -9534,8 +9958,12 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit -/// the specified operations to build the shuffle. -static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, +/// the specified operations to build the shuffle. ID is the perfect-shuffle +//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle +//table entry and LHS/RHS are the immediate inputs for this stage of the +//shuffle. 
+static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, + SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; @@ -9552,12 +9980,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, OP_VEXT1, OP_VEXT2, OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR, // VTRN, right result + OP_MOVLANE // Move lane. RHSID is the lane to move into }; if (OpNum == OP_COPY) { @@ -9567,9 +9996,71 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, return RHS; } + if (OpNum == OP_MOVLANE) { + // Decompose a PerfectShuffle ID to get the Mask for lane Elt + auto getPFIDLane = [](unsigned ID, int Elt) -> int { + assert(Elt < 4 && "Expected Perfect Lanes to be less than 4"); + Elt = 3 - Elt; + while (Elt > 0) { + ID /= 9; + Elt--; + } + return (ID % 9 == 8) ? -1 : ID % 9; + }; + + // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We + // get the lane to move from from the PFID, which is always from the + // original vectors (V1 or V2). + SDValue OpLHS = GeneratePerfectShuffle( + LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + assert(RHSID < 8 && "Expected a lane index for RHSID!"); + unsigned ExtLane = 0; + SDValue Input; + + // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs + // convert into a higher type. 
+ if (RHSID & 0x4) { + int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1; + if (MaskElt == -1) + MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1; + assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); + ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2); + Input = MaskElt < 2 ? V1 : V2; + if (VT.getScalarSizeInBits() == 16) { + Input = DAG.getBitcast(MVT::v2f32, Input); + OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS); + } else { + assert(VT.getScalarSizeInBits() == 32 && + "Expected 16 or 32 bit shuffle elemements"); + Input = DAG.getBitcast(MVT::v2f64, Input); + OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS); + } + } else { + int MaskElt = getPFIDLane(ID, RHSID); + assert(MaskElt >= 0 && "Didn't expect an undef movlane index!"); + ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4); + Input = MaskElt < 4 ? V1 : V2; + // Be careful about creating illegal types. Use f16 instead of i16. + if (VT == MVT::v4i16) { + Input = DAG.getBitcast(MVT::v4f16, Input); + OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS); + } + } + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + Input.getValueType().getVectorElementType(), + Input, DAG.getVectorIdxConstant(ExtLane, dl)); + SDValue Ins = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS, + Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl)); + return DAG.getBitcast(VT, Ins); + } + SDValue OpLHS, OpRHS; - OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); - OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, + RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS, + RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { @@ -9648,14 +10139,16 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, EVT EltVT = Op.getValueType().getVectorElementType(); unsigned BytesPerElt = EltVT.getSizeInBits() / 8; - 
SmallVector<SDValue, 8> TBLMask; - for (int Val : ShuffleMask) { - for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { - unsigned Offset = Byte + Val * BytesPerElt; - TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); - } + bool Swap = false; + if (V1.isUndef() || isZerosVector(V1.getNode())) { + std::swap(V1, V2); + Swap = true; } + // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill + // out of range values with 0s. We do need to make sure that any out-of-range + // values are really out-of-range for a v16i8 vector. + bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode()); MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; if (Op.getValueSizeInBits() == 128) { @@ -9663,11 +10156,23 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, IndexLen = 16; } + SmallVector<SDValue, 8> TBLMask; + for (int Val : ShuffleMask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + if (Swap) + Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen; + if (IsUndefOrZero && Offset >= IndexLen) + Offset = 255; + TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); + } + } + SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - if (V2.getNode()->isUndef()) { + if (IsUndefOrZero) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( @@ -9732,6 +10237,10 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, if (ExtIdxInBits % CastedEltBitWidth != 0) return false; + // Can't handle cases where vector size is not 128-bit + if (!Extract.getOperand(0).getValueType().is128BitVector()) + return false; + // Update the lane value by offsetting with the scaled extract index. 
LaneC += ExtIdxInBits / CastedEltBitWidth; @@ -10014,10 +10523,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG, + dl); } return GenerateTBL(Op, ShuffleMask, DAG); @@ -10025,56 +10532,33 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); EVT VT = Op.getValueType(); - EVT ElemVT = VT.getScalarType(); - SDValue SplatVal = Op.getOperand(0); if (useSVEForFixedLengthVectorVT(VT)) return LowerToScalableOp(Op, DAG); - // Extend input splat value where needed to fit into a GPR (32b or 64b only) - // FPRs don't have this restriction. - switch (ElemVT.getSimpleVT().SimpleTy) { - case MVT::i1: { - // The only legal i1 vectors are SVE vectors, so we can use SVE-specific - // lowering code. - if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) { - // We can hande the zero case during isel. - if (ConstVal->isZero()) - return Op; - if (ConstVal->isOne()) - return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); - } - // The general case of i1. There isn't any natural way to do this, - // so we use some trickery with whilelo. 
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, - DAG.getValueType(MVT::i1)); - SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, - MVT::i64); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, - DAG.getConstant(0, dl, MVT::i64), SplatVal); - } - case MVT::i8: - case MVT::i16: - case MVT::i32: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); - break; - case MVT::i64: - SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - break; - case MVT::f16: - case MVT::bf16: - case MVT::f32: - case MVT::f64: - // Fine as is - break; - default: - report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); - } + assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 && + "Unexpected vector type!"); + + // We can handle the constant cases during isel. + if (isa<ConstantSDNode>(Op.getOperand(0))) + return Op; - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); + // There isn't a natural way to handle the general i1 case, so we use some + // trickery with whilelo. + SDLoc DL(Op); + SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64); + SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal, + DAG.getValueType(MVT::i1)); + SDValue ID = + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + if (VT == MVT::nxv1i1) + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1, + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID, + Zero, SplatVal), + Zero); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal); } SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, @@ -10090,18 +10574,17 @@ SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, return SDValue(); // The DUPQ operation is indepedent of element type so normalise to i64s. 
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); SDValue Idx128 = Op.getOperand(2); // DUPQ can be used when idx is in range. auto *CIdx = dyn_cast<ConstantSDNode>(Idx128); if (CIdx && (CIdx->getZExtValue() <= 3)) { SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64); - SDNode *DUPQ = - DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI); - return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0)); + return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI); } + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1)); + // The ACLE says this must produce the same result as: // svtbl(data, svadd_x(svptrue_b64(), // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), @@ -10358,20 +10841,6 @@ static bool isAllConstantBuildVector(const SDValue &PotentialBVec, return true; } -static unsigned getIntrinsicID(const SDNode *N) { - unsigned Opcode = N->getOpcode(); - switch (Opcode) { - default: - return Intrinsic::not_intrinsic; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - if (IID < Intrinsic::num_intrinsics) - return IID; - return Intrinsic::not_intrinsic; - } - } -} - // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a // BUILD_VECTORs with constant element C1, C2 is a constant, and: @@ -10822,6 +11291,12 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, return SDValue(); } + // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from + // v4i32s. This is really a truncate, which we can construct out of (legal) + // concats and truncate nodes. + if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG)) + return M; + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 
if (NumElts >= 4) { if (SDValue shuffle = ReconstructShuffle(Op, DAG)) @@ -11121,29 +11596,36 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) return SDValue(); - EVT WideVT; - SDValue ExtVec; + // Here narrow and wide refers to the vector element types. After "casting" + // both vectors must have the same bit length and so because the subvector + // has fewer elements, those elements need to be bigger. + EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount()); + EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount()); + // NOP cast operands to the largest legal vector of the same element count. if (VT.isFloatingPoint()) { - // The InVT type should be legal. We can safely cast the unpacked - // subvector from InVT -> VT. - WideVT = VT; - ExtVec = getSVESafeBitCast(VT, Vec1, DAG); + Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG); + Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG); } else { - // Extend elements of smaller vector... - WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); - ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); + // Legal integer vectors are already their largest so Vec0 is fine as is. + Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); } + // To replace the top/bottom half of vector V with vector SubV we widen the + // preserved half of V, concatenate this to SubV (the order depending on the + // half being replaced) and then narrow the result. 
+ SDValue Narrow; if (Idx == 0) { SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); - } else if (Idx == InVT.getVectorMinNumElements()) { + Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0); + } else { + assert(Idx == InVT.getVectorMinNumElements() && + "Invalid subvector index!"); SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); + Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1); } - return SDValue(); + return getSVESafeBitCast(VT, Narrow, DAG); } if (Idx == 0 && isPackedVectorType(VT, DAG)) { @@ -11249,21 +11731,8 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (M[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = M[i]; - } - - // Compute the index in the perfect shuffle table. 
- unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) + unsigned Cost = getPerfectShuffleCost(M); + if (Cost <= 1) return true; } @@ -11360,9 +11829,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, unsigned EltSize = VT.getScalarSizeInBits(); switch (Op.getOpcode()) { - default: - llvm_unreachable("unexpected shift opcode"); - case ISD::SHL: if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); @@ -11405,7 +11871,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, return NegShiftLeft; } - return SDValue(); + llvm_unreachable("unexpected shift opcode"); } static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, @@ -11525,8 +11991,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } - const bool FullFP16 = - static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); + const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); // Make v4f16 (only) fcmp operations utilise vector instructions // v8f16 support will be a litle more complicated @@ -11594,7 +12059,8 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, (Op.getOpcode() != ISD::VECREDUCE_ADD && SrcVT.getVectorElementType() == MVT::i64); if (SrcVT.isScalableVector() || - useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) { + useSVEForFixedLengthVectorVT( + SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) { if (SrcVT.getVectorElementType() == MVT::i1) return LowerPredReductionToSVE(Op, DAG); @@ -11659,7 +12125,7 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const { - auto &Subtarget = static_cast<const AArch64Subtarget 
&>(DAG.getSubtarget()); + auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); @@ -11676,7 +12142,7 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { - auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); + auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); @@ -11772,8 +12238,8 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, SDLoc DL(Op); APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue(); - return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)), - DL, VT); + return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL, + VT); } /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. @@ -11867,23 +12333,23 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { - PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); + Type *ValTy = I.getParamElementType(0); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); + Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { - PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); + Type *ValTy = I.getParamElementType(1); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); + Info.memVT = MVT::getVT(ValTy); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = 
DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ValTy); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -11906,22 +12372,23 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_sve_ldnt1: { - PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); + Type *ElTy = cast<VectorType>(I.getType())->getElementType(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(I.getType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ElTy); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; return true; } case Intrinsic::aarch64_sve_stnt1: { - PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType()); + Type *ElTy = + cast<VectorType>(I.getArgOperand(0)->getType())->getElementType(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(I.getOperand(0)->getType()); Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType()); + Info.align = DL.getABITypeAlign(ElTy); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; return true; } @@ -12007,8 +12474,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { Instruction *User = I->user_back(); - if (User && - !(User->getOpcode() == Instruction::FSub || + if (!(User->getOpcode() == Instruction::FSub || User->getOpcode() == Instruction::FAdd)) return true; @@ -12194,9 +12660,6 @@ static bool isSplatShuffle(Value *V) { /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 
bool AArch64TargetLowering::shouldSinkOperands( Instruction *I, SmallVectorImpl<Use *> &Ops) const { - if (!I->getType()->isVectorTy()) - return false; - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { case Intrinsic::aarch64_neon_smull: @@ -12208,6 +12671,12 @@ bool AArch64TargetLowering::shouldSinkOperands( } LLVM_FALLTHROUGH; + case Intrinsic::fma: + if (isa<VectorType>(I->getType()) && + cast<VectorType>(I->getType())->getElementType()->isHalfTy() && + !Subtarget->hasFullFP16()) + return false; + LLVM_FALLTHROUGH; case Intrinsic::aarch64_neon_sqdmull: case Intrinsic::aarch64_neon_sqdmulh: case Intrinsic::aarch64_neon_sqrdmulh: @@ -12217,7 +12686,52 @@ bool AArch64TargetLowering::shouldSinkOperands( if (isSplatShuffle(II->getOperand(1))) Ops.push_back(&II->getOperandUse(1)); return !Ops.empty(); - + case Intrinsic::aarch64_sme_write_horiz: + case Intrinsic::aarch64_sme_write_vert: + case Intrinsic::aarch64_sme_writeq_horiz: + case Intrinsic::aarch64_sme_writeq_vert: { + auto *Idx = dyn_cast<Instruction>(II->getOperand(1)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(1)); + return true; + } + case Intrinsic::aarch64_sme_read_horiz: + case Intrinsic::aarch64_sme_read_vert: + case Intrinsic::aarch64_sme_readq_horiz: + case Intrinsic::aarch64_sme_readq_vert: + case Intrinsic::aarch64_sme_ld1b_vert: + case Intrinsic::aarch64_sme_ld1h_vert: + case Intrinsic::aarch64_sme_ld1w_vert: + case Intrinsic::aarch64_sme_ld1d_vert: + case Intrinsic::aarch64_sme_ld1q_vert: + case Intrinsic::aarch64_sme_st1b_vert: + case Intrinsic::aarch64_sme_st1h_vert: + case Intrinsic::aarch64_sme_st1w_vert: + case Intrinsic::aarch64_sme_st1d_vert: + case Intrinsic::aarch64_sme_st1q_vert: + case Intrinsic::aarch64_sme_ld1b_horiz: + case Intrinsic::aarch64_sme_ld1h_horiz: + case Intrinsic::aarch64_sme_ld1w_horiz: + case Intrinsic::aarch64_sme_ld1d_horiz: + case Intrinsic::aarch64_sme_ld1q_horiz: + 
case Intrinsic::aarch64_sme_st1b_horiz: + case Intrinsic::aarch64_sme_st1h_horiz: + case Intrinsic::aarch64_sme_st1w_horiz: + case Intrinsic::aarch64_sme_st1d_horiz: + case Intrinsic::aarch64_sme_st1q_horiz: { + auto *Idx = dyn_cast<Instruction>(II->getOperand(3)); + if (!Idx || Idx->getOpcode() != Instruction::Add) + return false; + Ops.push_back(&II->getOperandUse(3)); + return true; + } + case Intrinsic::aarch64_neon_pmull: + if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) + return false; + Ops.push_back(&II->getOperandUse(0)); + Ops.push_back(&II->getOperandUse(1)); + return true; case Intrinsic::aarch64_neon_pmull64: if (!areOperandsOfVmullHighP64(II->getArgOperand(0), II->getArgOperand(1))) @@ -12225,12 +12739,14 @@ bool AArch64TargetLowering::shouldSinkOperands( Ops.push_back(&II->getArgOperandUse(0)); Ops.push_back(&II->getArgOperandUse(1)); return true; - default: return false; } } + if (!I->getType()->isVectorTy()) + return false; + switch (I->getOpcode()) { case Instruction::Sub: case Instruction::Add: { @@ -12745,12 +13261,15 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, assert(VT.isScalableVector() && "Can only lower scalable vectors"); unsigned N, Opcode; - static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = { - {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, - {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, - {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; - - std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; + static const std::pair<unsigned, std::pair<unsigned, unsigned>> + IntrinsicMap[] = { + {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}}, + {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; + + std::tie(N, Opcode) = llvm::find_if(IntrinsicMap, [&](auto P) { + return P.first == Intrinsic; + })->second; 
assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 && "invalid tuple vector type!"); @@ -12850,7 +13369,7 @@ bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine, // if the folding leads to worse code. bool AArch64TargetLowering::isMulAddWithConstProfitable( - const SDValue &AddNode, const SDValue &ConstNode) const { + SDValue AddNode, SDValue ConstNode) const { // Let the DAGCombiner decide for vector types and large types. const EVT VT = AddNode.getValueType(); if (VT.isVector() || VT.getScalarSizeInBits() > 64) @@ -13025,6 +13544,28 @@ AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, return true; } +bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { + assert(((N->getOpcode() == ISD::SHL && + N->getOperand(0).getOpcode() == ISD::SRL) || + (N->getOpcode() == ISD::SRL && + N->getOperand(0).getOpcode() == ISD::SHL)) && + "Expected shift-shift mask"); + // Don't allow multiuse shift folding with the same shift amount. + if (!N->getOperand(0)->hasOneUse()) + return false; + + // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns. + EVT VT = N->getValueType(0); + if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) { + auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); + auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); + return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue()); + } + + return true; +} + bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); @@ -13221,6 +13762,61 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot); } +// Given an (integer) vecreduce, we know the order of the inputs does not +// matter. 
We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x)))) +// into UADDV(UADDLP(x)). This can also happen through an extra add, where we +// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))). +static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { + auto DetectAddExtract = [&](SDValue A) { + // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning + // UADDLP(x) if found. + if (A.getOpcode() != ISD::ADD) + return SDValue(); + EVT VT = A.getValueType(); + SDValue Op0 = A.getOperand(0); + SDValue Op1 = A.getOperand(1); + if (Op0.getOpcode() != Op0.getOpcode() || + (Op0.getOpcode() != ISD::ZERO_EXTEND && + Op0.getOpcode() != ISD::SIGN_EXTEND)) + return SDValue(); + SDValue Ext0 = Op0.getOperand(0); + SDValue Ext1 = Op1.getOperand(0); + if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR || + Ext0.getOperand(0) != Ext1.getOperand(0)) + return SDValue(); + // Check that the type is twice the add types, and the extract are from + // upper/lower parts of the same source. + if (Ext0.getOperand(0).getValueType().getVectorNumElements() != + VT.getVectorNumElements() * 2) + return SDValue(); + if ((Ext0.getConstantOperandVal(1) != 0 && + Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) && + (Ext1.getConstantOperandVal(1) != 0 && + Ext0.getConstantOperandVal(1) != VT.getVectorNumElements())) + return SDValue(); + unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? 
AArch64ISD::UADDLP + : AArch64ISD::SADDLP; + return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0)); + }; + + SDValue A = N->getOperand(0); + if (SDValue R = DetectAddExtract(A)) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R); + if (A.getOpcode() == ISD::ADD) { + if (SDValue R = DetectAddExtract(A.getOperand(0))) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), + DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, + A.getOperand(1))); + if (SDValue R = DetectAddExtract(A.getOperand(1))) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), + DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, + A.getOperand(0))); + } + return SDValue(); +} + + static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -13279,6 +13875,60 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } +SDValue +AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const { + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N, 0); // Lower SREM as SREM + + EVT VT = N->getValueType(0); + + // For scalable and fixed types, mark them as cheap so we can handle it much + // later. This allows us to handle larger than legal types. 
+ if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) + return SDValue(N, 0); + + // fold (srem X, pow2) + if ((VT != MVT::i32 && VT != MVT::i64) || + !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) + return SDValue(); + + unsigned Lg2 = Divisor.countTrailingZeros(); + if (Lg2 == 0) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue CCVal, CSNeg; + if (Lg2 == 1) { + SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL); + SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); + CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp); + + Created.push_back(Cmp.getNode()); + Created.push_back(And.getNode()); + } else { + SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); + SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); + SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne); + CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal, + Negs.getValue(1)); + + Created.push_back(Negs.getNode()); + Created.push_back(AndPos.getNode()); + Created.push_back(AndNeg.getNode()); + } + + return CSNeg; +} + static bool IsSVECntIntrinsic(SDValue S) { switch(getIntrinsicID(S.getNode())) { default: @@ -13300,11 +13950,10 @@ static bool IsSVECntIntrinsic(SDValue S) { /// operations need a bit more inspection to get this information. 
/// /// \param Extend The SDNode from the DAG that represents the extend operation -/// \param DAG The SelectionDAG hosting the \p Extend node /// /// \returns The type representing the \p Extend source type, or \p MVT::Other /// if no valid type can be determined -static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { +static EVT calculatePreExtendType(SDValue Extend) { switch (Extend.getOpcode()) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -13337,102 +13986,90 @@ static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { default: return MVT::Other; } - - llvm_unreachable("Code path unhandled in calculatePreExtendType!"); } -/// Combines a dup(sext/zext) node pattern into sext/zext(dup) -/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt -static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, - SelectionDAG &DAG) { - - ShuffleVectorSDNode *ShuffleNode = - dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode()); - if (!ShuffleNode) - return SDValue(); - - // Ensuring the mask is zero before continuing - if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) - return SDValue(); - - SDValue InsertVectorElt = VectorShuffle.getOperand(0); - - if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - SDValue InsertLane = InsertVectorElt.getOperand(2); - ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode()); - // Ensures the insert is inserting into lane 0 - if (!Constant || Constant->getZExtValue() != 0) +/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern +/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector +/// SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) { + EVT VT = BV.getValueType(); + if (BV.getOpcode() != ISD::BUILD_VECTOR && + BV.getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); - SDValue Extend = 
InsertVectorElt.getOperand(1); + // Use the first item in the buildvector/shuffle to get the size of the + // extend, and make sure it looks valid. + SDValue Extend = BV->getOperand(0); unsigned ExtendOpcode = Extend.getOpcode(); - bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || ExtendOpcode == ISD::SIGN_EXTEND_INREG || ExtendOpcode == ISD::AssertSext; if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) return SDValue(); - - EVT TargetType = VectorShuffle.getValueType(); - EVT PreExtendType = calculatePreExtendType(Extend, DAG); - - if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && - TargetType != MVT::v2i64) || - (PreExtendType == MVT::Other)) + // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure + // calculatePreExtendType will work without issue. + if (BV.getOpcode() == ISD::VECTOR_SHUFFLE && + ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND) return SDValue(); // Restrict valid pre-extend data type - if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && - PreExtendType != MVT::i32) - return SDValue(); - - EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); - - if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) - return SDValue(); - - if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) + EVT PreExtendType = calculatePreExtendType(Extend); + if (PreExtendType == MVT::Other || + PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2) return SDValue(); - SDLoc DL(VectorShuffle); - - SDValue InsertVectorNode = DAG.getNode( - InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), - DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), - DAG.getConstant(0, DL, MVT::i64)); - - std::vector<int> ShuffleMask(TargetType.getVectorNumElements()); - - SDValue VectorShuffleNode = - DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, - 
DAG.getUNDEF(PreExtendVT), ShuffleMask); - - SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - DL, TargetType, VectorShuffleNode); + // Make sure all other operands are equally extended + for (SDValue Op : drop_begin(BV->ops())) { + if (Op.isUndef()) + continue; + unsigned Opc = Op.getOpcode(); + bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || + Opc == ISD::AssertSext; + if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType) + return SDValue(); + } - return ExtendNode; + SDValue NBV; + SDLoc DL(BV); + if (BV.getOpcode() == ISD::BUILD_VECTOR) { + EVT PreExtendVT = VT.changeVectorElementType(PreExtendType); + EVT PreExtendLegalType = + PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType; + SmallVector<SDValue, 8> NewOps; + for (SDValue Op : BV->ops()) + NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType) + : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, + PreExtendLegalType)); + NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps); + } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE + EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType()); + NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0), + BV.getOperand(1).isUndef() + ? DAG.getUNDEF(PreExtendVT) + : BV.getOperand(1).getOperand(0), + cast<ShuffleVectorSDNode>(BV)->getMask()); + } + return DAG.getNode(IsSExt ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV); } /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { // If the value type isn't a vector, none of the operands are going to be dups - if (!Mul->getValueType(0).isVector()) + EVT VT = Mul->getValueType(0); + if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) return SDValue(); - SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); - SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); + SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG); // Neither operands have been changed, don't make any further changes if (!Op0 && !Op1) return SDValue(); SDLoc DL(Mul); - return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), - Op0 ? Op0 : Mul->getOperand(0), + return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0), Op1 ? 
Op1 : Mul->getOperand(1)); } @@ -13649,7 +14286,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->getAlignment(), + LN0->getPointerInfo(), LN0->getAlign(), LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them @@ -13676,8 +14313,10 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || - Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector()) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -13713,7 +14352,7 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == ISD::FP_TO_SINT_SAT || N->getOpcode() == ISD::FP_TO_UINT_SAT) { EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT(); - if (SatVT.getScalarSizeInBits() != IntBits) + if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits) return SDValue(); } @@ -13956,15 +14595,85 @@ static SDValue tryCombineToBSL(SDNode *N, return SDValue(); } +// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to +// convert to csel(ccmp(.., cc0)), depending on cc1: + +// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) +// => +// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0)) +// +// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) +// => +// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0)) +static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue CSel0 = N->getOperand(0); + SDValue CSel1 = N->getOperand(1); + + if (CSel0.getOpcode() != AArch64ISD::CSEL || + CSel1.getOpcode() != 
AArch64ISD::CSEL) + return SDValue(); + + if (!CSel0->hasOneUse() || !CSel1->hasOneUse()) + return SDValue(); + + if (!isNullConstant(CSel0.getOperand(0)) || + !isOneConstant(CSel0.getOperand(1)) || + !isNullConstant(CSel1.getOperand(0)) || + !isOneConstant(CSel1.getOperand(1))) + return SDValue(); + + SDValue Cmp0 = CSel0.getOperand(3); + SDValue Cmp1 = CSel1.getOperand(3); + AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2); + AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2); + if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse()) + return SDValue(); + if (Cmp1.getOpcode() != AArch64ISD::SUBS && + Cmp0.getOpcode() == AArch64ISD::SUBS) { + std::swap(Cmp0, Cmp1); + std::swap(CC0, CC1); + } + + if (Cmp1.getOpcode() != AArch64ISD::SUBS) + return SDValue(); + + SDLoc DL(N); + SDValue CCmp; + + if (N->getOpcode() == ISD::AND) { + AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0); + SDValue Condition = DAG.getConstant(InvCC0, DL, MVT_CC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1); + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0), + Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); + } else { + SDLoc DL(N); + AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1); + SDValue Condition = DAG.getConstant(CC0, DL, MVT_CC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1); + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0), + Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); + } + return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0), + CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32), + CCmp); +} + static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { - // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) SelectionDAG &DAG = 
DCI.DAG; EVT VT = N->getValueType(0); + if (SDValue R = performANDORCSELCombine(N, DAG)) + return R; + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); + // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; @@ -14015,7 +14724,7 @@ static SDValue performSVEAndCombine(SDNode *N, SDValue UnpkOp = Src->getOperand(0); SDValue Dup = N->getOperand(1); - if (Dup.getOpcode() != AArch64ISD::DUP) + if (Dup.getOpcode() != ISD::SPLAT_VECTOR) return SDValue(); SDLoc DL(N); @@ -14038,8 +14747,7 @@ static SDValue performSVEAndCombine(SDNode *N, // Otherwise, make sure we propagate the AND to the operand // of the unpack - Dup = DAG.getNode(AArch64ISD::DUP, DL, - UnpkOp->getValueType(0), + Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0), DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); SDValue And = DAG.getNode(ISD::AND, DL, @@ -14097,20 +14805,34 @@ static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); - if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + + if (SDValue R = performANDORCSELCombine(N, DAG)) + return R; + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); + // Although NEON has no EORV instruction, when only the least significant bit + // is required the operation is synonymous with ADDV. + if (LHS.getOpcode() == ISD::VECREDUCE_XOR && isOneConstant(RHS) && + LHS.getOperand(0).getValueType().isFixedLengthVector() && + LHS.hasOneUse()) { + SDLoc DL(N); + SDValue ADDV = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, LHS.getOperand(0)); + return DAG.getNode(ISD::AND, DL, VT, ADDV, RHS); + } + if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); // The combining code below works only for NEON vectors. 
In particular, it // does not work for SVE when dealing with vectors wider than 128 bits. - if (!(VT.is64BitVector() || VT.is128BitVector())) + if (!VT.is64BitVector() && !VT.is128BitVector()) return SDValue(); - BuildVectorSDNode *BVN = - dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode()); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); if (!BVN) return SDValue(); @@ -14141,107 +14863,125 @@ static SDValue performANDCombine(SDNode *N, return SDValue(); } -// Attempt to form urhadd(OpA, OpB) from -// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) -// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). -// The original form of the first expression is -// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the -// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)). -// Before this function is called the srl will have been lowered to -// AArch64ISD::VLSHR. -// This pass can also recognize signed variants of the patterns that use sign -// extension instead of zero extension and form a srhadd(OpA, OpB) or a -// shadd(OpA, OpB) from them. -static SDValue -performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - EVT VT = N->getValueType(0); +static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { + switch (Opcode) { + case ISD::STRICT_FADD: + case ISD::FADD: + return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; + case ISD::ADD: + return VT == MVT::i64; + default: + return false; + } +} - // Since we are looking for a right shift by a constant value of 1 and we are - // operating on types at least 16 bits in length (sign/zero extended OpA and - // OpB, which are at least 8 bits), it follows that the truncate will always - // discard the shifted-in bit and therefore the right shift will be logical - // regardless of the signedness of OpA and OpB. 
- SDValue Shift = N->getOperand(0); - if (Shift.getOpcode() != AArch64ISD::VLSHR) +static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, + AArch64CC::CondCode Cond); + +static bool isPredicateCCSettingOp(SDValue N) { + if ((N.getOpcode() == ISD::SETCC) || + (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN && + (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt || + // get_active_lane_mask is lowered to a whilelo instruction. + N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask))) + return true; + + return false; +} + +// Materialize : i1 = extract_vector_elt t37, Constant:i64<0> +// ... into: "ptrue p, all" + PTEST +static SDValue +performFirstTrueTestVectorCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); + // Make sure PTEST can be legalised with illegal types. + if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) return SDValue(); - // Is the right shift using an immediate value of 1? 
- uint64_t ShiftAmount = Shift.getConstantOperandVal(1); - if (ShiftAmount != 1) + SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType(); + + if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 || + !isNullConstant(N->getOperand(1))) return SDValue(); - SDValue ExtendOpA, ExtendOpB; - SDValue ShiftOp0 = Shift.getOperand(0); - unsigned ShiftOp0Opc = ShiftOp0.getOpcode(); - if (ShiftOp0Opc == ISD::SUB) { + // Restricted the DAG combine to only cases where we're extracting from a + // flag-setting operation. + if (!isPredicateCCSettingOp(N0)) + return SDValue(); - SDValue Xor = ShiftOp0.getOperand(1); - if (Xor.getOpcode() != ISD::XOR) - return SDValue(); + // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0 + SelectionDAG &DAG = DCI.DAG; + SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all); + return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE); +} - // Is the XOR using a constant amount of all ones in the right hand side? - uint64_t C; - if (!isAllConstantBuildVector(Xor.getOperand(1), C)) - return SDValue(); +// Materialize : Idx = (add (mul vscale, NumEls), -1) +// i1 = extract_vector_elt t37, Constant:i64<Idx> +// ... into: "ptrue p, all" + PTEST +static SDValue +performLastTrueTestVectorCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); + // Make sure PTEST is legal types. 
+ if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) + return SDValue(); - unsigned ElemSizeInBits = VT.getScalarSizeInBits(); - APInt CAsAPInt(ElemSizeInBits, C); - if (CAsAPInt != APInt::getAllOnes(ElemSizeInBits)) - return SDValue(); + SDValue N0 = N->getOperand(0); + EVT OpVT = N0.getValueType(); - ExtendOpA = Xor.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(0); - } else if (ShiftOp0Opc == ISD::ADD) { - ExtendOpA = ShiftOp0.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(1); - } else + if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) return SDValue(); - unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); - unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); - if (!(ExtendOpAOpc == ExtendOpBOpc && - (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) + // Idx == (add (mul vscale, NumEls), -1) + SDValue Idx = N->getOperand(1); + if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1))) return SDValue(); - // Is the result of the right shift being truncated to the same value type as - // the original operands, OpA and OpB? - SDValue OpA = ExtendOpA.getOperand(0); - SDValue OpB = ExtendOpB.getOperand(0); - EVT OpAVT = OpA.getValueType(); - assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); - if (!(VT == OpAVT && OpAVT == OpB.getValueType())) + SDValue VS = Idx.getOperand(0); + if (VS.getOpcode() != ISD::VSCALE) return SDValue(); - SDLoc DL(N); - bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; - bool IsRHADD = ShiftOp0Opc == ISD::SUB; - unsigned HADDOpc = IsSignExtend - ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD); - SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB); + unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue(); + if (VS.getConstantOperandVal(0) != NumEls) + return SDValue(); - return ResultHADD; + // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 
1 : 0 + SelectionDAG &DAG = DCI.DAG; + SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all); + return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE); } -static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { - switch (Opcode) { - case ISD::FADD: - return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; - case ISD::ADD: - return VT == MVT::i64; - default: - return false; - } -} +static SDValue +performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); + if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget)) + return Res; + if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget)) + return Res; -static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { + SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1); EVT VT = N->getValueType(0); - const bool FullFP16 = - static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); + const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); + bool IsStrict = N0->isStrictFPOpcode(); + + // extract(dup x) -> x + if (N0.getOpcode() == AArch64ISD::DUP) + return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Rewrite for pairwise fadd pattern // (f32 (extract_vector_elt @@ -14250,11 +14990,14 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { // -> // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) // (extract_vector_elt (vXf32 Other) 1)) + // For strict_fadd we need to make sure the old strict_fadd can be deleted, so + // we can only do this when it's used only by the extract_vector_elt. 
if (ConstantN1 && ConstantN1->getZExtValue() == 0 && - hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) { + hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) && + (!IsStrict || N0.hasOneUse())) { SDLoc DL(N0); - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); + SDValue N00 = N0->getOperand(IsStrict ? 1 : 0); + SDValue N01 = N0->getOperand(IsStrict ? 2 : 1); ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01); SDValue Other = N00; @@ -14267,11 +15010,23 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { if (Shuffle && Shuffle->getMaskElt(0) == 1 && Other == Shuffle->getOperand(0)) { - return DAG.getNode(N0->getOpcode(), DL, VT, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(0, DL, MVT::i64)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(1, DL, MVT::i64))); + SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(0, DL, MVT::i64)); + SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(1, DL, MVT::i64)); + if (!IsStrict) + return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2); + + // For strict_fadd we need uses of the final extract_vector to be replaced + // with the strict_fadd, but we also need uses of the chain output of the + // original strict_fadd to use the chain output of the new strict_fadd as + // otherwise it may not be deleted. 
+ SDValue Ret = DAG.getNode(N0->getOpcode(), DL, + {VT, MVT::Other}, + {N0->getOperand(0), Extract1, Extract2}); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1)); + return SDValue(N, 0); } } @@ -14321,25 +15076,61 @@ static SDValue performConcatVectorsCombine(SDNode *N, } } + if (N->getOperand(0).getValueType() == MVT::v4i8) { + // If we have a concat of v4i8 loads, convert them to a buildvector of f32 + // loads to prevent having to go through the v4i8 load legalization that + // needs to extend each element into a larger type. + if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) { + if (V.getValueType() != MVT::v4i8) + return false; + if (V.isUndef()) + return true; + LoadSDNode *LD = dyn_cast<LoadSDNode>(V); + return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() && + LD->getExtensionType() == ISD::NON_EXTLOAD; + })) { + EVT NVT = + EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands()); + SmallVector<SDValue> Ops; + + for (unsigned i = 0; i < N->getNumOperands(); i++) { + SDValue V = N->getOperand(i); + if (V.isUndef()) + Ops.push_back(DAG.getUNDEF(MVT::f32)); + else { + LoadSDNode *LD = cast<LoadSDNode>(V); + SDValue NewLoad = + DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(), + LD->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); + Ops.push_back(NewLoad); + } + } + return DAG.getBitcast(N->getValueType(0), + DAG.getBuildVector(NVT, dl, Ops)); + } + } + + // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); - // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted - // subvectors from the same original vectors. Combine these into a single - // [us]rhadd or [us]hadd that operates on the two original vectors. 
Example: - // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), - // extract_subvector (v16i8 OpB, - // <0>))), - // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>), - // extract_subvector (v16i8 OpB, - // <8>))))) + // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use + // extracted subvectors from the same original vectors. Combine these into a + // single avg that operates on the two original vectors. + // avgceil is the target independant name for rhadd, avgfloor is a hadd. + // Example: + // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>), + // extract_subvector (v16i8 OpB, <0>))), + // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>), + // extract_subvector (v16i8 OpB, <8>))))) // -> - // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) + // (v16i8(avgceils(v16i8 OpA, v16i8 OpB))) if (N->getNumOperands() == 2 && N0Opc == N1Opc && - (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD || - N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) { + (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS || + N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); @@ -14411,6 +15202,29 @@ static SDValue performConcatVectorsCombine(SDNode *N, } static SDValue +performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue V = N->getOperand(0); + + // NOTE: This combine exists in DAGCombiner, but that version's legality check + // blocks this combine because the non-const case requires custom lowering. 
+ // + // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const) + if (V.getOpcode() == ISD::SPLAT_VECTOR) + if (isa<ConstantSDNode>(V.getOperand(0))) + return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0)); + + return SDValue(); +} + +static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDLoc DL(N); @@ -14470,33 +15284,34 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, // Check the operand and see if it originates from a lane extract. SDValue Op1 = N->getOperand(1); - if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - // Yep, no additional predication needed. Perform the transform. - SDValue IID = N->getOperand(0); - SDValue Shift = N->getOperand(2); - SDValue Vec = Op1.getOperand(0); - SDValue Lane = Op1.getOperand(1); - EVT ResTy = N->getValueType(0); - EVT VecResTy; - SDLoc DL(N); + if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); - // The vector width should be 128 bits by the time we get here, even - // if it started as 64 bits (the extract_vector handling will have - // done so). - assert(Vec.getValueSizeInBits() == 128 && - "unexpected vector size on extract_vector_elt!"); - if (Vec.getValueType() == MVT::v4i32) - VecResTy = MVT::v4f32; - else if (Vec.getValueType() == MVT::v2i64) - VecResTy = MVT::v2f64; - else - llvm_unreachable("unexpected vector type!"); + // Yep, no additional predication needed. Perform the transform. 
+ SDValue IID = N->getOperand(0); + SDValue Shift = N->getOperand(2); + SDValue Vec = Op1.getOperand(0); + SDValue Lane = Op1.getOperand(1); + EVT ResTy = N->getValueType(0); + EVT VecResTy; + SDLoc DL(N); - SDValue Convert = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); - } - return SDValue(); + // The vector width should be 128 bits by the time we get here, even + // if it started as 64 bits (the extract_vector handling will have + // done so). Bail if it is not. + if (Vec.getValueSizeInBits() != 128) + return SDValue(); + + if (Vec.getValueType() == MVT::v4i32) + VecResTy = MVT::v4f32; + else if (Vec.getValueType() == MVT::v2i64) + VecResTy = MVT::v2f64; + else + return SDValue(); + + SDValue Convert = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); } // AArch64 high-vector "long" operations are formed by performing the non-high @@ -14515,6 +15330,11 @@ static SDValue tryCombineFixedPointConvert(SDNode *N, // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold // similarly here. 
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { + MVT VT = N.getSimpleValueType(); + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N.getConstantOperandVal(1) == 0) + N = N.getOperand(0); + switch (N.getOpcode()) { case AArch64ISD::DUP: case AArch64ISD::DUPLANE8: @@ -14535,18 +15355,19 @@ static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { return SDValue(); } - MVT NarrowTy = N.getSimpleValueType(); - if (!NarrowTy.is64BitVector()) + if (!VT.is64BitVector()) return SDValue(); - MVT ElementTy = NarrowTy.getVectorElementType(); - unsigned NumElems = NarrowTy.getVectorNumElements(); - MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); + SDLoc DL(N); + unsigned NumElems = VT.getVectorNumElements(); + if (N.getValueType().is64BitVector()) { + MVT ElementTy = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); + N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops()); + } - SDLoc dl(N); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, - DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), - DAG.getConstant(NumElems, dl, MVT::i64)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N, + DAG.getConstant(NumElems, DL, MVT::i64)); } static bool isEssentiallyExtractHighSubvector(SDValue N) { @@ -14696,7 +15517,7 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { } // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) -static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); // Only scalar integer and vector types. 
if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger()) @@ -14732,6 +15553,81 @@ static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, DL, MVT::i64)); } +/// Perform the scalar expression combine in the form of: +/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc) +/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc) +static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Handle commutivity. + if (LHS.getOpcode() != AArch64ISD::CSEL && + LHS.getOpcode() != AArch64ISD::CSNEG) { + std::swap(LHS, RHS); + if (LHS.getOpcode() != AArch64ISD::CSEL && + LHS.getOpcode() != AArch64ISD::CSNEG) { + return SDValue(); + } + } + + if (!LHS.hasOneUse()) + return SDValue(); + + AArch64CC::CondCode AArch64CC = + static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2)); + + // The CSEL should include a const one operand, and the CSNEG should include + // One or NegOne operand. 
+ ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0)); + ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + if (!CTVal || !CFVal) + return SDValue(); + + if (!(LHS.getOpcode() == AArch64ISD::CSEL && + (CTVal->isOne() || CFVal->isOne())) && + !(LHS.getOpcode() == AArch64ISD::CSNEG && + (CTVal->isOne() || CFVal->isAllOnes()))) + return SDValue(); + + // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc) + if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() && + !CFVal->isOne()) { + std::swap(CTVal, CFVal); + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); + } + + SDLoc DL(N); + // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc) + if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() && + !CFVal->isAllOnes()) { + APInt C = -1 * CFVal->getAPIntValue(); + CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT)); + CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT)); + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); + } + + // It might be neutral for larger constants, as the immediate need to be + // materialized in a register. 
+ APInt ADDC = CTVal->getAPIntValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) + return SDValue(); + + assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) || + (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) && + "Unexpected constant value"); + + SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0)); + SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32); + SDValue Cmp = LHS.getOperand(3); + + return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp); +} + // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y) static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -14755,6 +15651,49 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { Dot.getOperand(2)); } +static bool isNegatedInteger(SDValue Op) { + return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)); +} + +static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Zero = DAG.getConstant(0, DL, VT); + return DAG.getNode(ISD::SUB, DL, VT, Zero, Op); +} + +// Try to fold +// +// (neg (csel X, Y)) -> (csel (neg X), (neg Y)) +// +// The folding helps csel to be matched with csneg without generating +// redundant neg instruction, which includes negation of the csel expansion +// of abs node lowered by lowerABS. +static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) { + if (!isNegatedInteger(SDValue(N, 0))) + return SDValue(); + + SDValue CSel = N->getOperand(1); + if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse()) + return SDValue(); + + SDValue N0 = CSel.getOperand(0); + SDValue N1 = CSel.getOperand(1); + + // If both of them is not negations, it's not worth the folding as it + // introduces two additional negations while reducing one negation. 
+ if (!isNegatedInteger(N0) && !isNegatedInteger(N1)) + return SDValue(); + + SDValue N0N = getNegatedInteger(N0, DAG); + SDValue N1N = getNegatedInteger(N1, DAG); + + SDLoc DL(N); + EVT VT = CSel.getValueType(); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2), + CSel.getOperand(3)); +} + // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. They are normally matched by // patterns like: @@ -14808,14 +15747,120 @@ static SDValue performAddSubLongCombine(SDNode *N, return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } +static bool isCMP(SDValue Op) { + return Op.getOpcode() == AArch64ISD::SUBS && + !Op.getNode()->hasAnyUseOfValue(0); +} + +// (CSEL 1 0 CC Cond) => CC +// (CSEL 0 1 CC Cond) => !CC +static Optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) { + if (Op.getOpcode() != AArch64ISD::CSEL) + return None; + auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2)); + if (CC == AArch64CC::AL || CC == AArch64CC::NV) + return None; + SDValue OpLHS = Op.getOperand(0); + SDValue OpRHS = Op.getOperand(1); + if (isOneConstant(OpLHS) && isNullConstant(OpRHS)) + return CC; + if (isNullConstant(OpLHS) && isOneConstant(OpRHS)) + return getInvertedCondCode(CC); + + return None; +} + +// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry) +// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry) +static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) { + SDValue CmpOp = Op->getOperand(2); + if (!isCMP(CmpOp)) + return SDValue(); + + if (IsAdd) { + if (!isOneConstant(CmpOp.getOperand(1))) + return SDValue(); + } else { + if (!isNullConstant(CmpOp.getOperand(0))) + return SDValue(); + } + + SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1); + auto CC = getCSETCondCode(CsetOp); + if (CC != (IsAdd ? 
AArch64CC::HS : AArch64CC::LO)) + return SDValue(); + + return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(), + Op->getOperand(0), Op->getOperand(1), + CsetOp.getOperand(3)); +} + +// (ADC x 0 cond) => (CINC x HS cond) +static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Cond = N->getOperand(2); + + if (!isNullConstant(RHS)) + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // (CINC x cc cond) <=> (CSINC x x !cc cond) + SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32); + return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond); +} + +// Transform vector add(zext i8 to i32, zext i8 to i32) +// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32) +// This allows extra uses of saddl/uaddl at the lower vector widths, and less +// extends. +static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 || + (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) || + (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) || + N->getOperand(0).getOperand(0).getValueType() != + N->getOperand(1).getOperand(0).getValueType()) + return SDValue(); + + SDValue N0 = N->getOperand(0).getOperand(0); + SDValue N1 = N->getOperand(1).getOperand(0); + EVT InVT = N0.getValueType(); + + EVT S1 = InVT.getScalarType(); + EVT S2 = VT.getScalarType(); + if ((S2 == MVT::i32 && S1 == MVT::i8) || + (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) { + SDLoc DL(N); + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), + S2.getHalfSizedIntegerVT(*DAG.getContext()), + VT.getVectorElementCount()); + SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0); + SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1); + 
SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp); + } + return SDValue(); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Try to change sum of two reductions. - if (SDValue Val = performUADDVCombine(N, DAG)) + if (SDValue Val = performAddUADDVCombine(N, DAG)) return Val; if (SDValue Val = performAddDotCombine(N, DAG)) return Val; + if (SDValue Val = performAddCSelIntoCSinc(N, DAG)) + return Val; + if (SDValue Val = performNegCSelCombine(N, DAG)) + return Val; + if (SDValue Val = performVectorAddSubExtCombine(N, DAG)) + return Val; return performAddSubLongCombine(N, DCI, DAG); } @@ -15176,6 +16221,9 @@ static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) { return false; } + if (ISD::isConstantSplatVectorAllOnes(N.getNode())) + return true; + // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size // or smaller than the implicit element type represented by N. // NOTE: A larger element count implies a smaller element type. @@ -15186,8 +16234,7 @@ static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) { // If we're compiling for a specific vector-length, we can check if the // pattern's VL equals that of the scalable vector at runtime. 
if (N.getOpcode() == AArch64ISD::PTRUE) { - const auto &Subtarget = - static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); + const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); if (MaxSVESize && MinSVESize == MaxSVESize) { @@ -15233,6 +16280,39 @@ static SDValue performIntrinsicCombine(SDNode *N, switch (IID) { default: break; + case Intrinsic::get_active_lane_mask: { + SDValue Res = SDValue(); + EVT VT = N->getValueType(0); + if (VT.isFixedLengthVector()) { + // We can use the SVE whilelo instruction to lower this intrinsic by + // creating the appropriate sequence of scalable vector operations and + // then extracting a fixed-width subvector from the scalable vector. + + SDLoc DL(N); + SDValue ID = + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); + + EVT WhileVT = EVT::getVectorVT( + *DAG.getContext(), MVT::i1, + ElementCount::getScalable(VT.getVectorNumElements())); + + // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32. + EVT PromVT = getPromotedVTForPredicate(WhileVT); + + // Get the fixed-width equivalent of PromVT for extraction. 
+ EVT ExtVT = + EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(), + VT.getVectorElementCount()); + + Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID, + N->getOperand(1), N->getOperand(2)); + Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res, + DAG.getConstant(0, DL, MVT::i64)); + Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res); + } + return Res; + } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); @@ -15261,7 +16341,11 @@ static SDValue performIntrinsicCombine(SDNode *N, return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: + return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_umull: + return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_pmull: case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); @@ -15350,6 +16434,10 @@ static SDValue performIntrinsicCombine(SDNode *N, return convertMergedOpToPredOp(N, ISD::XOR, DAG, true); case Intrinsic::aarch64_sve_orr: return convertMergedOpToPredOp(N, ISD::OR, DAG, true); + case Intrinsic::aarch64_sve_sabd: + return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true); + case Intrinsic::aarch64_sve_uabd: + return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true); case Intrinsic::aarch64_sve_sqadd: return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true); case Intrinsic::aarch64_sve_sqsub: @@ -15538,7 +16626,7 @@ static SDValue performExtendCombine(SDNode *N, static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts) { assert(!St.isTruncatingStore() && "cannot split truncating vector store"); - unsigned OrigAlignment = St.getAlignment(); 
+ Align OrigAlignment = St.getAlign(); unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; // Create scalar stores. This is at least as good as the code sequence for a @@ -15563,7 +16651,7 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, unsigned Offset = EltOffset; while (--NumVecElts) { - unsigned Alignment = MinAlign(OrigAlignment, Offset); + Align Alignment = commonAlignment(OrigAlignment, Offset); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); @@ -15636,10 +16724,6 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT PtrTy = N->getOperand(3).getValueType(); - if (VT == MVT::nxv8bf16 && - !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) - return SDValue(); - EVT LoadVT = VT; if (VT.isFloatingPoint()) LoadVT = VT.changeTypeToInteger(); @@ -15667,9 +16751,6 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { "Unsupported opcode."); SDLoc DL(N); EVT VT = N->getValueType(0); - if (VT == MVT::nxv8bf16 && - !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) - return SDValue(); EVT LoadVT = VT; if (VT.isFloatingPoint()) @@ -15692,10 +16773,6 @@ static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { EVT HwSrcVt = getSVEContainerType(DataVT); SDValue InputVT = DAG.getValueType(DataVT); - if (DataVT == MVT::nxv8bf16 && - !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) - return SDValue(); - if (DataVT.isFloatingPoint()) InputVT = DAG.getValueType(HwSrcVt); @@ -15722,10 +16799,6 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { EVT DataVT = Data.getValueType(); EVT PtrTy = N->getOperand(4).getValueType(); - if (DataVT == MVT::nxv8bf16 && - !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) - return SDValue(); - if (DataVT.isFloatingPoint()) Data = DAG.getNode(ISD::BITCAST, DL, 
DataVT.changeTypeToInteger(), Data); @@ -15912,8 +16985,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // extensions can use this to mark that it does not want splitting to happen // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of // eliminating alignment hazards is only 1 in 8 for alignment of 2. - if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || - S->getAlignment() <= 2) + if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) || + S->getAlign() <= Align(2)) return SDValue(); // If we get a splat of a scalar convert this vector store to a store of @@ -15934,11 +17007,11 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->getAlignment(), S->getMemOperand()->getFlags()); + S->getAlign(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->getAlignment(), + S->getPointerInfo(), S->getAlign(), S->getMemOperand()->getFlags()); } @@ -15970,6 +17043,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { SDValue Op1 = N->getOperand(1); EVT ResVT = N->getValueType(0); + // uzp1(x, undef) -> concat(truncate(x), undef) + if (Op1.getOpcode() == ISD::UNDEF) { + EVT BCVT = MVT::Other, HalfVT = MVT::Other; + switch (ResVT.getSimpleVT().SimpleTy) { + default: + break; + case MVT::v16i8: + BCVT = MVT::v8i16; + HalfVT = MVT::v8i8; + break; + case MVT::v8i16: + BCVT = MVT::v4i32; + HalfVT = MVT::v4i16; + break; + case MVT::v4i32: + BCVT = MVT::v2i64; + HalfVT = MVT::v2i32; + break; + } + if (BCVT != MVT::Other) { + SDValue BC = DAG.getBitcast(BCVT, Op0); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, 
Trunc, + DAG.getUNDEF(HalfVT)); + } + } + // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { @@ -16267,6 +17367,152 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } +/// \return true if part of the index was folded into the Base. +static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, + SDLoc DL, SelectionDAG &DAG) { + // This function assumes a vector of i64 indices. + EVT IndexVT = Index.getValueType(); + if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64) + return false; + + // Simplify: + // BasePtr = Ptr + // Index = X + splat(Offset) + // -> + // BasePtr = Ptr + Offset * scale. + // Index = X + if (Index.getOpcode() == ISD::ADD) { + if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) { + Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale); + BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset); + Index = Index.getOperand(0); + return true; + } + } + + // Simplify: + // BasePtr = Ptr + // Index = (X + splat(Offset)) << splat(Shift) + // -> + // BasePtr = Ptr + (Offset << Shift) * scale) + // Index = X << splat(shift) + if (Index.getOpcode() == ISD::SHL && + Index.getOperand(0).getOpcode() == ISD::ADD) { + SDValue Add = Index.getOperand(0); + SDValue ShiftOp = Index.getOperand(1); + SDValue OffsetOp = Add.getOperand(1); + if (auto Shift = DAG.getSplatValue(ShiftOp)) + if (auto Offset = DAG.getSplatValue(OffsetOp)) { + Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift); + Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale); + BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset); + Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(), + Add.getOperand(0), ShiftOp); + return true; + } + } + + return false; +} + +// Analyse the specified address returning true if a more optimal addressing +// mode is available. 
When returning true all parameters are updated to reflect +// their recommended values. +static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, + SDValue &BasePtr, SDValue &Index, + SelectionDAG &DAG) { + // Try to iteratively fold parts of the index into the base pointer to + // simplify the index as much as possible. + bool Changed = false; + while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG)) + Changed = true; + + // Only consider element types that are pointer sized as smaller types can + // be easily promoted. + EVT IndexVT = Index.getValueType(); + if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64) + return Changed; + + // Match: + // Index = step(const) + int64_t Stride = 0; + if (Index.getOpcode() == ISD::STEP_VECTOR) + Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue(); + + // Match: + // Index = step(const) << shift(const) + else if (Index.getOpcode() == ISD::SHL && + Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) { + SDValue RHS = Index.getOperand(1); + if (auto *Shift = + dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) { + int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1); + Stride = Step << Shift->getZExtValue(); + } + } + + // Return early because no supported pattern is found. 
+ if (Stride == 0) + return Changed; + + if (Stride < std::numeric_limits<int32_t>::min() || + Stride > std::numeric_limits<int32_t>::max()) + return Changed; + + const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + unsigned MaxVScale = + Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock; + int64_t LastElementOffset = + IndexVT.getVectorMinNumElements() * Stride * MaxVScale; + + if (LastElementOffset < std::numeric_limits<int32_t>::min() || + LastElementOffset > std::numeric_limits<int32_t>::max()) + return Changed; + + EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32); + // Stride does not scale explicitly by 'Scale', because it happens in + // the gather/scatter addressing mode. + Index = DAG.getNode(ISD::STEP_VECTOR, SDLoc(N), NewIndexVT, + DAG.getTargetConstant(Stride, SDLoc(N), MVT::i32)); + return true; +} + +static SDValue performMaskedGatherScatterCombine( + SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N); + assert(MGS && "Can only combine gather load or scatter store nodes"); + + if (!DCI.isBeforeLegalize()) + return SDValue(); + + SDLoc DL(MGS); + SDValue Chain = MGS->getChain(); + SDValue Scale = MGS->getScale(); + SDValue Index = MGS->getIndex(); + SDValue Mask = MGS->getMask(); + SDValue BasePtr = MGS->getBasePtr(); + ISD::MemIndexType IndexType = MGS->getIndexType(); + + if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG)) + return SDValue(); + + // Here we catch such cases early and change MGATHER's IndexType to allow + // the use of an Index that's more legalisation friendly. 
+ if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) { + SDValue PassThru = MGT->getPassThru(); + SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedGather( + DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL, + Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType()); + } + auto *MSC = cast<MaskedScatterSDNode>(MGS); + SDValue Data = MSC->getValue(); + SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL, + Ops, MSC->getMemOperand(), IndexType, + MSC->isTruncatingStore()); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -16723,6 +17969,47 @@ static SDValue performBRCONDCombine(SDNode *N, return SDValue(); } +static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) { + unsigned CC = N->getConstantOperandVal(2); + SDValue SUBS = N->getOperand(3); + SDValue Zero, CTTZ; + + if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) { + Zero = N->getOperand(0); + CTTZ = N->getOperand(1); + } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) { + Zero = N->getOperand(1); + CTTZ = N->getOperand(0); + } else + return SDValue(); + + if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) || + (CTTZ.getOpcode() == ISD::TRUNCATE && + CTTZ.getOperand(0).getOpcode() != ISD::CTTZ)) + return SDValue(); + + assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) && + "Illegal type in CTTZ folding"); + + if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1))) + return SDValue(); + + SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE + ? CTTZ.getOperand(0).getOperand(0) + : CTTZ.getOperand(0); + + if (X != SUBS.getOperand(0)) + return SDValue(); + + unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE + ? 
CTTZ.getOperand(0).getValueSizeInBits() + : CTTZ.getValueSizeInBits(); + SDValue BitWidthMinusOne = + DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType()); + return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ, + BitWidthMinusOne); +} + // Optimize CSEL instructions static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, @@ -16731,6 +18018,11 @@ static SDValue performCSELCombine(SDNode *N, if (N->getOperand(0) == N->getOperand(1)) return N->getOperand(0); + // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1 + // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1 + if (SDValue Folded = foldCSELofCTTZ(N, DAG)) + return Folded; + return performCONDCombine(N, DCI, DAG, 2, 3); } @@ -16739,14 +18031,14 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); + SDLoc DL(N); + EVT VT = N->getValueType(0); // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X if (Cond == ISD::SETNE && isOneConstant(RHS) && LHS->getOpcode() == AArch64ISD::CSEL && isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) && LHS->hasOneUse()) { - SDLoc DL(N); - // Invert CSEL's condition. 
auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2)); auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue()); @@ -16757,9 +18049,48 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0), LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32), LHS.getOperand(3)); - return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0)); + return DAG.getZExtOrTrunc(CSEL, DL, VT); } + // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne + if (Cond == ISD::SETNE && isNullConstant(RHS) && + LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) && + LHS->hasOneUse()) { + EVT TstVT = LHS->getValueType(0); + if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) { + // this pattern will get better opt in emitComparison + uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1); + SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0), + DAG.getConstant(TstImm, DL, TstVT)); + return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2)); + } + } + + return SDValue(); +} + +// Replace a flag-setting operator (eg ANDS) with the generic version +// (eg AND) if the flag is unused. +static SDValue performFlagSettingCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + unsigned GenericOpcode) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = N->getValueType(0); + + // If the flag result isn't used, convert back to a generic opcode. + if (!N->hasAnyUseOfValue(1)) { + SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops()); + return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)}, + DL); + } + + // Combine identical generic nodes into this node, re-using the result. 
+ if (SDNode *Generic = DCI.DAG.getNodeIfExists( + GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) + DCI.CombineTo(Generic, SDValue(N, 0)); + return SDValue(); } @@ -16801,27 +18132,46 @@ static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue +performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && "Unexpected opcode!"); + SelectionDAG &DAG = DCI.DAG; SDValue Pred = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); - // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne - // => inner setcc_merge_zero - if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && - LHS->getOpcode() == ISD::SIGN_EXTEND && - LHS->getOperand(0)->getValueType(0) == N->getValueType(0) && - LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && - LHS->getOperand(0)->getOperand(0) == Pred) - return LHS->getOperand(0); - if (SDValue V = performSetCCPunpkCombine(N, DAG)) return V; + if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && + LHS->getOpcode() == ISD::SIGN_EXTEND && + LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) { + // setcc_merge_zero( + // pred, extend(setcc_merge_zero(pred, ...)), != splat(0)) + // => setcc_merge_zero(pred, ...) + if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && + LHS->getOperand(0)->getOperand(0) == Pred) + return LHS->getOperand(0); + + // setcc_merge_zero( + // all_active, extend(nxvNi1 ...), != splat(0)) + // -> nxvNi1 ... + if (isAllActivePredicate(DAG, Pred)) + return LHS->getOperand(0); + + // setcc_merge_zero( + // pred, extend(nxvNi1 ...), != splat(0)) + // -> nxvNi1 and(pred, ...) 
+ if (DCI.isAfterLegalizeDAG()) + // Do this after legalization to allow more folds on setcc_merge_zero + // to be recognized. + return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), + LHS->getOperand(0), Pred); + } + return SDValue(); } @@ -16928,12 +18278,53 @@ static SDValue performTBZCombine(SDNode *N, DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); } +// Swap vselect operands where it may allow a predicated operation to achieve +// the `sel`. +// +// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b))) +// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a)) +static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) { + auto SelectA = N->getOperand(1); + auto SelectB = N->getOperand(2); + auto NTy = N->getValueType(0); + + if (!NTy.isScalableVector()) + return SDValue(); + SDValue SetCC = N->getOperand(0); + if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse()) + return SDValue(); + + switch (SelectB.getOpcode()) { + default: + return SDValue(); + case ISD::FMUL: + case ISD::FSUB: + case ISD::FADD: + break; + } + if (SelectA != SelectB.getOperand(0)) + return SDValue(); + + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); + ISD::CondCode InverseCC = + ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType()); + auto InverseSetCC = + DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0), + SetCC.getOperand(1), InverseCC); + + return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy, + {InverseSetCC, SelectB, SelectA}); +} + // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. 
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + if (auto SwapResult = trySwapVSelectOperands(N, DAG)) + return SwapResult; + SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); @@ -17064,6 +18455,24 @@ static SDValue performSelectCombine(SDNode *N, return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } +static SDValue performDUPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the + // 128bit vector version. + if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { + EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); + if (SDNode *LN = DCI.DAG.getNodeIfExists( + N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) { + SDLoc DL(N); + return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0), + DCI.DAG.getConstant(0, DL, MVT::i64)); + } + } + + return performPostLD1Combine(N, DCI, false); +} + /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) @@ -17104,13 +18513,14 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, // Check whether folding this offset is legal. It must not go out of bounds of // the referenced object to avoid violating the code model, and must be - // smaller than 2^21 because this is the largest offset expressible in all - // object formats. + // smaller than 2^20 because this is the largest offset expressible in all + // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF + // stores an immediate signed 21 bit offset.) // // This check also prevents us from folding negative offsets, which will end // up being treated in the same way as large positive ones. They could also // cause code model violations, and aren't really common enough to matter. 
- if (Offset >= (1 << 21)) + if (Offset >= (1 << 20)) return SDValue(); const GlobalValue *GV = GN->getGlobal(); @@ -17621,7 +19031,7 @@ performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return performPostLD1Combine(N, DCI, true); } -SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { EVT Ty = N->getValueType(0); if (Ty.isInteger()) return SDValue(); @@ -17643,9 +19053,9 @@ SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getBitcast(Ty, Trunc); } -SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { +static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -17675,6 +19085,31 @@ SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget, + bool fixedSVEVectorVT) { + EVT VT = N->getValueType(0); + + // Don't expand for SVE2 + if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME()) + return SDValue(); + + // Don't expand for NEON + if (VT.isFixedLengthVector() && !fixedSVEVectorVT) + return SDValue(); + + SDLoc DL(N); + + SDValue Mask = N->getOperand(0); + SDValue In1 = N->getOperand(1); + SDValue In2 = N->getOperand(2); + + SDValue InvMask = DAG.getNOT(DL, Mask, VT); + SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1); + SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2); + return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -17685,6 +19120,22 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case 
ISD::ADD: case ISD::SUB: return performAddSubCombine(N, DCI, DAG); + case AArch64ISD::ANDS: + return performFlagSettingCombine(N, DCI, ISD::AND); + case AArch64ISD::ADC: + if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) + return R; + return foldADCToCINC(N, DAG); + case AArch64ISD::SBC: + return foldOverflowCheck(N, DAG, /* IsAdd */ false); + case AArch64ISD::ADCS: + if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true)) + return R; + return performFlagSettingCombine(N, DCI, AArch64ISD::ADC); + case AArch64ISD::SBCS: + if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false)) + return R; + return performFlagSettingCombine(N, DCI, AArch64ISD::SBC); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: @@ -17711,10 +19162,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performExtendCombine(N, DCI, DAG); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); - case ISD::TRUNCATE: - return performVectorTruncateCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); + case ISD::EXTRACT_SUBVECTOR: + return performExtractSubvectorCombine(N, DCI, DAG); case ISD::INSERT_SUBVECTOR: return performInsertSubvectorCombine(N, DCI, DAG); case ISD::SELECT: @@ -17729,6 +19180,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); + case ISD::MGATHER: + case ISD::MSCATTER: + return performMaskedGatherScatterCombine(N, DCI, DAG); case ISD::VECTOR_SPLICE: return performSVESpliceCombine(N, DAG); case ISD::FP_EXTEND: @@ -17741,7 +19195,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::CSEL: return performCSELCombine(N, DCI, DAG); case AArch64ISD::DUP: - return performPostLD1Combine(N, DCI, false); + return performDUPCombine(N, DCI); case AArch64ISD::NVCAST: return performNVCASTCombine(N); case AArch64ISD::SPLICE: @@ -17752,7 +19206,7 @@ 
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::UZP1: return performUzpCombine(N, DAG); case AArch64ISD::SETCC_MERGE_ZERO: - return performSetccMergeZeroCombine(N, DAG); + return performSetccMergeZeroCombine(N, DCI); case AArch64ISD::GLD1_MERGE_ZERO: case AArch64ISD::GLD1_SCALED_MERGE_ZERO: case AArch64ISD::GLD1_UXTW_MERGE_ZERO: @@ -17773,12 +19227,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performVectorShiftCombine(N, *this, DCI); case AArch64ISD::SUNPKLO: return performSunpkloCombine(N, DAG); + case AArch64ISD::BSP: + return performBSPExpandForSVE( + N, DAG, Subtarget, useSVEForFixedLengthVectorVT(N->getValueType(0))); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: - return performExtractVectorEltCombine(N, DAG); + return performExtractVectorEltCombine(N, DCI, Subtarget); case ISD::VECREDUCE_ADD: return performVecReduceAddCombine(N, DCI.DAG, Subtarget); + case AArch64ISD::UADDV: + return performUADDVCombine(N, DAG); + case AArch64ISD::SMULL: + case AArch64ISD::UMULL: + return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -18152,6 +19614,15 @@ void AArch64TargetLowering::ReplaceBITCASTResults( if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) { assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() && "Expected fp->int bitcast!"); + + // Bitcasting between unpacked vector types of different element counts is + // not a NOP because the live elements are laid out differently. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? 
+ if (VT.getVectorElementCount() != SrcVT.getVectorElementCount()) + return; + SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult)); return; @@ -18169,6 +19640,53 @@ void AArch64TargetLowering::ReplaceBITCASTResults( Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } +static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.is256BitVector() || + (VT.getScalarType().isFloatingPoint() && + !N->getFlags().hasAllowReassociation()) || + (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16())) + return; + + SDValue X = N->getOperand(0); + auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1)); + if (!Shuf) { + Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0)); + X = N->getOperand(1); + if (!Shuf) + return; + } + + if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef()) + return; + + // Check the mask is 1,0,3,2,5,4,... + ArrayRef<int> Mask = Shuf->getMask(); + for (int I = 0, E = Mask.size(); I < E; I++) + if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1)) + return; + + SDLoc DL(N); + auto LoHi = DAG.SplitVector(X, DL); + assert(LoHi.first.getValueType() == LoHi.second.getValueType()); + SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(), + LoHi.first, LoHi.second); + + // Shuffle the elements back into order. 
+ SmallVector<int> NMask; + for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) { + NMask.push_back(I); + NMask.push_back(I); + } + Results.push_back( + DAG.getVectorShuffle(VT, DL, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp, + DAG.getUNDEF(LoHi.first.getValueType())), + DAG.getUNDEF(VT), NMask)); +} + static void ReplaceReductionResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, unsigned InterOp, @@ -18346,6 +19864,10 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::VECREDUCE_UMIN: Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; + case ISD::ADD: + case ISD::FADD: + ReplaceAddWithADDP(N, Results, DAG, Subtarget); + return; case ISD::CTPOP: if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG)) @@ -18406,8 +19928,10 @@ void AArch64TargetLowering::ReplaceNodeResults( ReplaceExtractSubVectorResults(N, Results, DAG); return; case ISD::INSERT_SUBVECTOR: - // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate - // to common code for result type legalisation + case ISD::CONCAT_VECTORS: + // Custom lowering has been requested for INSERT_SUBVECTOR and + // CONCAT_VECTORS -- but delegate to common code for result type + // legalisation return; case ISD::INTRINSIC_WO_CHAIN: { EVT VT = N->getValueType(0); @@ -18485,11 +20009,11 @@ bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const { if (auto LI = dyn_cast<LoadInst>(I)) return LI->getType()->getPrimitiveSizeInBits() == 128 && - LI->getAlignment() >= 16; + LI->getAlign() >= Align(16); if (auto SI = dyn_cast<StoreInst>(I)) return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && - SI->getAlignment() >= 16; + SI->getAlign() >= Align(16); return false; } @@ -18502,12 +20026,12 @@ bool AArch64TargetLowering::shouldInsertFencesForAtomic( // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go 
wrong. -bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { +TargetLoweringBase::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); - if (Size != 128) - return false; - - return !isOpSuitableForLDPSTP(SI); + if (Size != 128 || isOpSuitableForLDPSTP(SI)) + return AtomicExpansionKind::None; + return AtomicExpansionKind::Expand; } // Loads and stores less than 128-bits are already atomic; ones above that @@ -18627,7 +20151,10 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); - Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy); + CallInst *CI = Builder.CreateCall(Ldxr, Addr); + CI->addParamAttr( + 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); + Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); return Builder.CreateBitCast(Trunc, ValueTy); } @@ -18668,10 +20195,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); Val = Builder.CreateBitCast(Val, IntValTy); - return Builder.CreateCall(Stxr, - {Builder.CreateZExtOrBitCast( - Val, Stxr->getFunctionType()->getParamType(0)), - Addr}); + CallInst *CI = Builder.CreateCall( + Stxr, {Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr}); + CI->addParamAttr(1, Attribute::get(Builder.getContext(), + Attribute::ElementType, Val->getType())); + return CI; } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( @@ -18993,8 +20523,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use // AArch64SVEPredPattern::all, which can enable the use of 
unpredicated // variants of instructions when available. - const auto &Subtarget = - static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); + const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); if (MaxSVESize && MinSVESize == MaxSVESize && @@ -19080,22 +20609,23 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( MemVT = MemVT.changeTypeToInteger(); } - auto NewLoad = DAG.getMaskedLoad( + SDValue NewLoad = DAG.getMaskedLoad( LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg, DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(), Load->getAddressingMode(), Load->getExtensionType()); + SDValue Result = NewLoad; if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { EVT ExtendVT = ContainerVT.changeVectorElementType( Load->getMemoryVT().getVectorElementType()); - NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG); - NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, - Pg, NewLoad, DAG.getUNDEF(ContainerVT)); + Result = getSVESafeBitCast(ExtendVT, Result, DAG); + Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, + Pg, Result, DAG.getUNDEF(ContainerVT)); } - auto Result = convertFromScalableVector(DAG, VT, NewLoad); - SDValue MergedValues[2] = {Result, Load->getChain()}; + Result = convertFromScalableVector(DAG, VT, Result); + SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; return DAG.getMergeValues(MergedValues, DL); } @@ -19143,19 +20673,20 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( IsPassThruZeroOrUndef = true; } - auto NewLoad = DAG.getMaskedLoad( + SDValue NewLoad = DAG.getMaskedLoad( ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(), Load->getExtensionType()); + SDValue Result = NewLoad; 
if (!IsPassThruZeroOrUndef) { SDValue OldPassThru = convertToScalableVector(DAG, ContainerVT, Load->getPassThru()); - NewLoad = DAG.getSelect(DL, ContainerVT, Mask, NewLoad, OldPassThru); + Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru); } - auto Result = convertFromScalableVector(DAG, VT, NewLoad); - SDValue MergedValues[2] = {Result, Load->getChain()}; + Result = convertFromScalableVector(DAG, VT, Result); + SDValue MergedValues[2] = {Result, NewLoad.getValue(1)}; return DAG.getMergeValues(MergedValues, DL); } @@ -19232,7 +20763,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( // Scalable vector i32/i64 DIV is supported. if (EltVT == MVT::i32 || EltVT == MVT::i64) - return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, PredOpcode); // Scalable vector i8/i16 DIV is not supported. Promote it to i32. EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); @@ -19387,13 +20918,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt( // NOTE: The results for inactive lanes are undefined. SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, - unsigned NewOp, - bool OverrideNEON) const { + unsigned NewOp) const { EVT VT = Op.getValueType(); SDLoc DL(Op); auto Pg = getPredicateForVector(DAG, DL, VT); - if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) { + if (VT.isFixedLengthVector()) { + assert(isTypeLegal(VT) && "Expected only legal fixed-width types"); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); // Create list of operands by converting existing ones to scalable types. 
@@ -19411,8 +20942,8 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, continue; } - assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) && - "Only fixed length vectors are supported!"); + assert(isTypeLegal(V.getValueType()) && + "Expected only legal fixed-width types"); Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); } @@ -19543,7 +21074,9 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode, SDValue VecOp = ScalarOp.getOperand(0); EVT SrcVT = VecOp.getValueType(); - if (useSVEForFixedLengthVectorVT(SrcVT, true)) { + if (useSVEForFixedLengthVectorVT( + SrcVT, + /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); } @@ -19950,6 +21483,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); + // Safe bitcasting between unpacked vector types of different element counts + // is currently unsupported because the following is missing the necessary + // work to ensure the result's elements live where they're supposed to within + // an SVE register. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + assert((VT.getVectorElementCount() == InVT.getVectorElementCount() || + VT == PackedVT || InVT == PackedInVT) && + "Unexpected bitcast!"); + // Pack input if required. 
if (InVT != PackedInVT) Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); @@ -20016,6 +21560,13 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } +bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const { + return Op.getOpcode() == AArch64ISD::DUP || + (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Op.getOperand(0).getOpcode() == AArch64ISD::DUP) || + TargetLowering::isTargetCanonicalConstantNode(Op); +} + bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal( unsigned Opc, LLT Ty1, LLT Ty2) const { return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2138c0ffe70a..06ea918ea32e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -55,6 +55,8 @@ enum NodeType : unsigned { // x29, x29` marker instruction. CALL_RVMARKER, + CALL_BTI, // Function call followed by a BTI instruction. + // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. TLSDESC_CALLSEQ, @@ -79,7 +81,6 @@ enum NodeType : unsigned { // Predicated instructions where inactive lanes produce undefined results. 
ABDS_PRED, ABDU_PRED, - ADD_PRED, FADD_PRED, FDIV_PRED, FMA_PRED, @@ -98,7 +99,6 @@ enum NodeType : unsigned { SMIN_PRED, SRA_PRED, SRL_PRED, - SUB_PRED, UDIV_PRED, UMAX_PRED, UMIN_PRED, @@ -158,6 +158,7 @@ enum NodeType : unsigned { DUPLANE16, DUPLANE32, DUPLANE64, + DUPLANE128, // Vector immedate moves MOVI, @@ -232,15 +233,10 @@ enum NodeType : unsigned { SADDV, UADDV, - // Vector halving addition - SHADD, - UHADD, - - // Vector rounding halving addition - SRHADD, - URHADD, - - // Unsigned Add Long Pairwise + // Add Pairwise of two vectors + ADDP, + // Add Long Pairwise + SADDLP, UADDLP, // udot/sdot instructions @@ -411,6 +407,10 @@ enum NodeType : unsigned { SSTNT1_PRED, SSTNT1_INDEX_PRED, + // SME + RDSVL, + REVD_MERGE_PASSTHRU, + // Asserts that a function argument (i32) is zero-extended to i8 by // the caller ASSERT_ZEXT_BOOL, @@ -462,23 +462,6 @@ enum NodeType : unsigned { } // end namespace AArch64ISD -namespace { - -// Any instruction that defines a 32-bit result zeros out the high half of the -// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. But any other 32-bit operation will zero-extend -// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper -// 32 bits, they're probably just qualifying a CopyFromReg. -static inline bool isDef32(const SDNode &N) { - unsigned Opc = N.getOpcode(); - return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && - Opc != ISD::CopyFromReg && Opc != ISD::AssertSext && - Opc != ISD::AssertZext && Opc != ISD::AssertAlign && - Opc != ISD::FREEZE; -} - -} // end anonymous namespace - namespace AArch64 { /// Possible values of current rounding mode, which is specified in bits /// 23:22 of FPCR. @@ -501,6 +484,11 @@ public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); + /// Control the following reassociation of operands: (op (op x, c1), y) -> (op + /// (op x, y), c1) where N0 is (op x, c1) and N1 is y. 
+ bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; @@ -573,6 +561,17 @@ public: MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitMopa(unsigned Opc, unsigned BaseReg, MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -610,8 +609,8 @@ public: bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; - bool isMulAddWithConstProfitable(const SDValue &AddNode, - const SDValue &ConstNode) const override; + bool isMulAddWithConstProfitable(SDValue AddNode, + SDValue ConstNode) const override; bool shouldConsiderGEPOffsetSplit() const override; @@ -651,6 +650,10 @@ public: bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; + /// Return true if it is profitable to fold a pair of shifts into a mask. + bool shouldFoldConstantShiftPairToMask(const SDNode *N, + CombineLevel Level) const override; + /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. 
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, @@ -680,7 +683,8 @@ public: TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; - bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; @@ -898,11 +902,8 @@ private: SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - bool isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const; + bool + isEligibleForTailCallOptimization(const CallLoweringInfo &CLI) const; /// Finds the incoming stack arguments which overlap the given fixed stack /// object and incorporates their load into the current chain. 
This prevents @@ -980,8 +981,8 @@ private: SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp, - bool OverrideNEON = false) const; + SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, + unsigned NewOp) const; SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; @@ -1052,6 +1053,8 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const override; + SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const override; SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override; @@ -1093,7 +1096,7 @@ private: } bool shouldExtendGSIndex(EVT VT, EVT &EltTy) const override; - bool shouldRemoveExtendFromGSIndex(EVT VT) const override; + bool shouldRemoveExtendFromGSIndex(EVT IndexVT, EVT DataVT) const override; bool isVectorLoadExtDesirable(SDValue ExtVal) const override; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; @@ -1129,6 +1132,8 @@ private: TargetLoweringOpt &TLO, unsigned Depth) const override; + bool isTargetCanonicalConstantNode(SDValue Op) const override; + // Normally SVE is only used for byte size vectors that do not fit within a // NEON vector. This changes when OverrideNEON is true, allowing SVE to be // used for 64bit and 128bit vectors as well. 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index b220929514f9..c477a44b13b2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -27,22 +27,43 @@ def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>; // supported, but when they're relaxed and anything can be used, all the // standard modes would be valid and may give efficiency gains. +// An atomic load operation that does not need either acquire or release +// semantics. +class relaxed_load<PatFrag base> + : PatFrag<(ops node:$ptr), (base node:$ptr)> { + let IsAtomic = 1; + let IsAtomicOrderingAcquireOrStronger = 0; +} + // A atomic load operation that actually needs acquire semantics. class acquiring_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; - let IsAtomicOrderingAcquireOrStronger = 1; + let IsAtomicOrderingAcquire = 1; } -// An atomic load operation that does not need either acquire or release -// semantics. -class relaxed_load<PatFrag base> +// An atomic load operation that needs sequential consistency. +class seq_cst_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; - let IsAtomicOrderingAcquireOrStronger = 0; + let IsAtomicOrderingSequentiallyConsistent = 1; +} + +// RCPC extension, currently opt-in under a separate feature. +let Predicates = [HasLDAPR] in { + // v8.3 Release Consistent Processor Consistent support, optional in v8.2. 
+ // 8-bit loads + def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDAPRB GPR64sp:$ptr)>; + // 16-bit loads + def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDAPRH GPR64sp:$ptr)>; + // 32-bit loads + def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDAPRW GPR64sp:$ptr)>; + // 64-bit loads + def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDAPRX GPR64sp:$ptr)>; } // 8-bit loads +def : Pat<(seq_cst_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)), @@ -58,6 +79,7 @@ def : Pat<(relaxed_load<atomic_load_8> (LDURBBi GPR64sp:$Rn, simm9:$offset)>; // 16-bit loads +def : Pat<(seq_cst_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)), @@ -73,6 +95,7 @@ def : Pat<(relaxed_load<atomic_load_16> (LDURHHi GPR64sp:$Rn, simm9:$offset)>; // 32-bit loads +def : Pat<(seq_cst_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)), @@ -88,6 +111,7 @@ def : Pat<(relaxed_load<atomic_load_32> (LDURWi GPR64sp:$Rn, simm9:$offset)>; // 64-bit loads +def : Pat<(seq_cst_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)), @@ -490,7 +514,8 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch), let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch", mayLoad = 1, mayStore = 1 in { 
-class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch), +class cmp_swap_128 : Pseudo<(outs GPR64common:$RdLo, GPR64common:$RdHi, + GPR32common:$scratch), (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, GPR64:$newLo, GPR64:$newHi), []>, Sched<[WriteAtomic]>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 4c1e41b7efee..78bc1b8c6f02 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -109,15 +109,19 @@ class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>; -// Helper fragment for an extract of the high portion of a 128-bit vector. +// Helper fragment for an extract of the high portion of a 128-bit vector. The +// ComplexPattern match both extract_subvector and bitcast(extract_subvector(..)). def extract_high_v16i8 : - UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; + ComplexPattern<v8i8, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>; def extract_high_v8i16 : - UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; + ComplexPattern<v4i16, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>; def extract_high_v4i32 : - UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; -def extract_high_v2i64 : - UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; + ComplexPattern<v2i32, 1, "SelectExtractHigh", [extract_subvector, bitconvert]>; + +def extract_high_dup_v8i16 : + BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>; +def extract_high_dup_v4i32 : + BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>; //===----------------------------------------------------------------------===// // Asm Operand Classes. 
@@ -1178,6 +1182,13 @@ def fpimm32XForm : SDNodeXForm<fpimm, [{ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>; +def fpimm32SIMDModImmType4XForm : SDNodeXForm<fpimm, [{ + uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType4(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>; + def fpimm64XForm : SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = AArch64_AM::getFP64Imm(InVal); @@ -1199,6 +1210,13 @@ def fpimm32 : Operand<f32>, let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } + +def fpimm32SIMDModImmType4 : FPImmLeaf<f32, [{ + uint64_t Enc = Imm.bitcastToAPInt().getZExtValue(); + return Enc != 0 && AArch64_AM::isAdvSIMDModImmType4(Enc << 32 | Enc); + }], fpimm32SIMDModImmType4XForm> { +} + def fpimm64 : Operand<f64>, FPImmLeaf<f64, [{ return AArch64_AM::getFP64Imm(Imm) != -1; @@ -1234,6 +1252,9 @@ def gi_fpimm32 : GICustomOperandRenderer<"renderFPImm32">, GISDNodeXFormEquiv<fpimm32XForm>; def gi_fpimm64 : GICustomOperandRenderer<"renderFPImm64">, GISDNodeXFormEquiv<fpimm64XForm>; +def gi_fpimm32SIMDModImmType4 : + GICustomOperandRenderer<"renderFPImm32SIMDModImmType4">, + GISDNodeXFormEquiv<fpimm32SIMDModImmType4XForm>; // Vector lane operands class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass { @@ -1261,8 +1282,12 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; -defm VectorIndex0 : VectorIndex<i64, VectorIndex0Operand, +let OperandNamespace = "AArch64" in { + let OperandType = "OPERAND_IMPLICIT_IMM_0" in { + defm VectorIndex0 : VectorIndex<i64, VectorIndex0Operand, [{ return ((uint64_t)Imm) == 0; }]>; + } +} defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>; defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand, @@ -1312,6 +1337,8 @@ def sme_elm_idx0_0 : 
Operand<i64>, ImmLeaf<i64, [{ }]> { let ParserMatchClass = Imm0_0Operand; let PrintMethod = "printMatrixIndex"; + let OperandNamespace = "AArch64"; + let OperandType = "OPERAND_IMPLICIT_IMM_0"; } def sme_elm_idx0_1 : Operand<i64>, ImmLeaf<i64, [{ return ((uint64_t)Imm) <= 1; @@ -4512,8 +4539,9 @@ multiclass MemTagStore<bits<2> opc1, string insn> { //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm> - : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", []>, +class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm, + list<dag> pattern = []> + : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", pattern>, Sched<[WriteSys]> { bits<16> imm; let Inst{31-24} = 0b11010100; @@ -4542,6 +4570,7 @@ let Predicates = [HasFPARMv8] in { // Floating point to integer conversion //--- +let mayRaiseFPException = 1 in class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode, RegisterClass srcType, RegisterClass dstType, string asm, list<dag> pattern> @@ -4561,7 +4590,7 @@ class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode, let Inst{4-0} = Rd; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode, RegisterClass srcType, RegisterClass dstType, Operand immType, string asm, list<dag> pattern> @@ -4683,7 +4712,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, // Integer to floating point conversion //--- -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +let mayStore = 0, mayLoad = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseIntegerToFP<bit isUnsigned, RegisterClass srcType, RegisterClass dstType, Operand immType, string asm, list<dag> pattern> @@ -4701,6 +4730,7 @@ class BaseIntegerToFP<bit isUnsigned, let Inst{4-0} = Rd; } +let mayRaiseFPException = 1 in class 
BaseIntegerToFPUnscaled<bit isUnsigned, RegisterClass srcType, RegisterClass dstType, ValueType dvt, string asm, SDPatternOperator node> @@ -4937,6 +4967,7 @@ multiclass UnscaledConversion<string asm> { // Floating point conversion //--- +let mayRaiseFPException = 1 in class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType, RegisterClass srcType, string asm, list<dag> pattern> : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, @@ -4963,15 +4994,15 @@ multiclass FPConversion<string asm> { // Half-precision to Double-precision def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>; + [(set FPR64:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>; // Half-precision to Single-precision def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>; + [(set FPR32:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>; // Single-precision to Double-precision def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, - [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>; + [(set FPR64:$Rd, (any_fpextend FPR32:$Rn))]>; // Single-precision to Half-precision def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, @@ -4999,8 +5030,9 @@ class BaseSingleOperandFPData<bits<6> opcode, RegisterClass regtype, } multiclass SingleOperandFPData<bits<4> opcode, string asm, - SDPatternOperator node = null_frag> { - + SDPatternOperator node = null_frag, + int fpexceptions = 1> { + let mayRaiseFPException = fpexceptions in { def Hr : BaseSingleOperandFPData<{0b00,opcode}, FPR16, f16, asm, node> { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; @@ -5013,8 +5045,14 @@ multiclass SingleOperandFPData<bits<4> opcode, string asm, def Dr : BaseSingleOperandFPData<{0b00,opcode}, FPR64, f64, asm, node> { let Inst{23-22} = 0b01; // 64-bit size flag } + } } +multiclass SingleOperandFPDataNoException<bits<4> opcode, string asm, + SDPatternOperator node = null_frag> + : 
SingleOperandFPData<opcode, asm, node, 0>; + +let mayRaiseFPException = 1 in multiclass SingleOperandFPNo16<bits<6> opcode, string asm, SDPatternOperator node = null_frag>{ @@ -5035,7 +5073,7 @@ multiclass FRIntNNT<bits<2> opcode, string asm, SDPatternOperator node = null_fr // Two operand floating point data processing //--- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, string asm, list<dag> pat> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), @@ -5075,7 +5113,8 @@ multiclass TwoOperandFPData<bits<4> opcode, string asm, } } -multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { +multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, + SDPatternOperator node> { def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> { let Inst{23-22} = 0b11; // 16-bit size flag @@ -5098,6 +5137,7 @@ multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { // Three operand floating point data processing //--- +let mayRaiseFPException = 1 in class BaseThreeOperandFPData<bit isNegated, bit isSub, RegisterClass regtype, string asm, list<dag> pat> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), @@ -5142,7 +5182,7 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm, // Floating point data comparisons //--- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseOneOperandFPComparison<bit signalAllNans, RegisterClass regtype, string asm, list<dag> pat> @@ -5161,7 +5201,7 @@ class BaseOneOperandFPComparison<bit signalAllNans, let PostEncoderMethod = "fixOneOperandFPComparison"; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, 
mayRaiseFPException = 1 in class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, string asm, list<dag> pat> : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, @@ -5218,7 +5258,7 @@ multiclass FPComparison<bit signalAllNans, string asm, // Floating point conditional comparisons //--- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, string mnemonic, list<dag> pat> : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), @@ -5544,6 +5584,7 @@ multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm, } // As above, but only floating point elements supported. +let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -5565,6 +5606,7 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc, [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } +let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { @@ -5587,6 +5629,7 @@ multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc, [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } +let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -5614,6 +5657,7 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc, } // As above, but D and B sized elements unsupported. 
+let mayRaiseFPException = 1 in multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, @@ -5718,6 +5762,7 @@ multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperato // ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions // select inputs from 4H vectors and accumulate outputs to a 2S vector (or from // 8H to 4S, when Q=1). +let mayRaiseFPException = 1 in class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1, string kind2, RegisterOperand RegType, ValueType AccumType, ValueType InputType, @@ -5986,7 +6031,9 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, // Supports H, S and D element sizes, uses high bit of the size field // as an extra opcode bit. multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, - SDPatternOperator OpNode> { + SDPatternOperator OpNode, + int fpexceptions = 1> { + let mayRaiseFPException = fpexceptions in { let Predicates = [HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, asm, ".4h", ".4h", @@ -6004,9 +6051,15 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; + } } +multiclass SIMDTwoVectorFPNoException<bit U, bit S, bits<5> opc, string asm, + SDPatternOperator OpNode> + : SIMDTwoVectorFP<U, S, opc, asm, OpNode, 0>; + // Supports only S and D element sizes +let mayRaiseFPException = 1 in multiclass SIMDTwoVectorSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { @@ -6036,7 +6089,7 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } - +let mayRaiseFPException = 1 in multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm, 
SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -6058,6 +6111,7 @@ multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm, [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } +let mayRaiseFPException = 1 in multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -6209,6 +6263,7 @@ multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm, multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, string asm, SDNode OpNode> { + let mayRaiseFPException = 1 in { let Predicates = [HasNEON, HasFullFP16] in { def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, asm, ".4h", "0.0", @@ -6226,6 +6281,7 @@ multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; + } let Predicates = [HasNEON, HasFullFP16] in { def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0", @@ -6253,7 +6309,7 @@ multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand outtype, RegisterOperand intype, string asm, string VdTy, string VnTy, @@ -6275,7 +6331,7 @@ class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{4-0} = Rd; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand outtype, RegisterOperand intype, string asm, string VdTy, string VnTy, @@ -6457,8 +6513,8 @@ multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, asm#"2", ".1q", ".2d", ".2d", []>; } - def : 
Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), - (v8i8 (extract_high_v16i8 V128:$Rm)))), + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), + (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))), (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } @@ -6471,8 +6527,8 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, V128, V128, V128, asm#"2", ".4s", ".8h", ".8h", - [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))]>; + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, V128, V64, V64, asm, ".2d", ".2s", ".2s", @@ -6480,8 +6536,8 @@ multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc, V128, V128, V128, asm#"2", ".2d", ".4s", ".4s", - [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))]>; + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, @@ -6495,8 +6551,8 @@ multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, V128, V128, V128, asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))))]>; + (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, V128, V64, V64, asm, ".4s", ".4h", ".4h", @@ -6506,8 +6562,8 @@ multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, V128, V128, V128, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 
V128:$Rm)))))]>; + (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, V128, V64, V64, asm, ".2d", ".2s", ".2s", @@ -6517,8 +6573,8 @@ multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, V128, V128, V128, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))))]>; + (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; } multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, @@ -6535,8 +6591,8 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$dst), (add (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm))))))]>; + (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm)))))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, V128, V64, V64, asm, ".4s", ".4h", ".4h", @@ -6548,8 +6604,8 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$dst), (add (v4i32 V128:$Rd), - (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm))))))]>; + (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm)))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, V128, V64, V64, asm, ".2d", ".2s", ".2s", @@ -6561,8 +6617,8 @@ multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$dst), (add (v2i64 V128:$Rd), - (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm))))))]>; + (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>; } multiclass 
SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, @@ -6574,8 +6630,8 @@ multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, V128, V128, V128, asm#"2", ".8h", ".16b", ".16b", - [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))]>; + [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, V128, V64, V64, asm, ".4s", ".4h", ".4h", @@ -6583,8 +6639,8 @@ multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, V128, V128, V128, asm#"2", ".4s", ".8h", ".8h", - [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))]>; + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, V128, V64, V64, asm, ".2d", ".2s", ".2s", @@ -6592,8 +6648,8 @@ multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc, V128, V128, V128, asm#"2", ".2d", ".4s", ".4s", - [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))]>; + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc, @@ -6609,8 +6665,8 @@ multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc, asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), - (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))]>; + (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, V128, V64, V64, asm, ".4s", ".4h", 
".4h", @@ -6621,8 +6677,8 @@ multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))]>; + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, V128, V64, V64, asm, ".2d", ".2s", ".2s", @@ -6633,8 +6689,8 @@ multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))]>; + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm, @@ -6651,8 +6707,8 @@ multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm, asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))))]>; + (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, V128, V64, V64, asm, ".2d", ".2s", ".2s", @@ -6665,8 +6721,8 @@ multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm, asm#"2", ".2d", ".4s", ".4s", [(set (v2i64 V128:$dst), (Accum (v2i64 V128:$Rd), - (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))))]>; + (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; } multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm, @@ -6679,7 +6735,7 @@ multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm, V128, V128, V128, asm#"2", ".8h", ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - 
(extract_high_v16i8 V128:$Rm)))]>; + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, V128, V128, V64, asm, ".4s", ".4s", ".4h", @@ -6688,7 +6744,7 @@ multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm, V128, V128, V128, asm#"2", ".4s", ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))]>; + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, V128, V128, V64, asm, ".2d", ".2d", ".2s", @@ -6697,7 +6753,7 @@ multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm, V128, V128, V128, asm#"2", ".2d", ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (extract_high_v4i32 V128:$Rm)))]>; + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } //---------------------------------------------------------------------------- @@ -6876,7 +6932,7 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm> { multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag, Predicate pred = HasNEON> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in { let Predicates = [pred] in { def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; @@ -6895,7 +6951,7 @@ multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in { def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, @@ 
-7025,6 +7081,7 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, let Inst{4-0} = Rd; } +let mayRaiseFPException = 1 in class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm> : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, @@ -7048,11 +7105,13 @@ multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm, multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { + let mayRaiseFPException = 1 in { def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">; def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">; let Predicates = [HasNEON, HasFullFP16] in { def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">; } + } def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; @@ -7076,6 +7135,7 @@ multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>; } +let mayRaiseFPException = 1 in multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm, Predicate pred = HasNEON> { let Predicates = [pred] in { @@ -7087,6 +7147,7 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm, } } +let mayRaiseFPException = 1 in multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm, @@ -7169,6 +7230,7 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> { asm, ".2d">; } +let mayRaiseFPException = 1 in multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> { let Predicates = [HasNEON, HasFullFP16] in { def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, @@ -7232,6 +7294,7 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> { asm, ".4s", []>; } +let mayRaiseFPException = 1 in multiclass 
SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, Intrinsic intOp> { let Predicates = [HasNEON, HasFullFP16] in { @@ -7351,7 +7414,7 @@ class SIMDMovAlias<string asm, string size, Instruction inst, multiclass SMov { // SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME) // streaming mode. - let Predicates = [HasNEONorStreamingSVE] in { + let Predicates = [HasNEONorSME] in { def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> { let Inst{20-16} = 0b00001; } @@ -7398,7 +7461,7 @@ multiclass SMov { multiclass UMov { // UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME) // streaming mode. - let Predicates = [HasNEONorStreamingSVE] in { + let Predicates = [HasNEONorSME] in { def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> { let Inst{20-16} = 0b00001; } @@ -8048,6 +8111,7 @@ multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> { ".2h", V128, v4f32, v8bf16>; } +let mayRaiseFPException = 1 in class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), @@ -8056,6 +8120,7 @@ class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); } +let mayRaiseFPException = 1 in class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode> : I<(outs V128:$dst), (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm, @@ -8095,18 +8160,21 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm> ", $Rm", ".8h", "}"); } +let mayRaiseFPException = 1 in class SIMD_BFCVTN : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, "bfcvtn", ".4h", ".4s", [(set (v8bf16 V128:$Rd), (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; +let mayRaiseFPException = 1 in class SIMD_BFCVTN2 : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, "bfcvtn2", ".8h", ".4s", [(set (v8bf16 V128:$dst), (int_aarch64_neon_bfcvtn2 (v8bf16 
V128:$Rd), (v4f32 V128:$Rn)))]>; +let mayRaiseFPException = 1 in class BF16ToSinglePrecision<string asm> : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, @@ -8160,6 +8228,7 @@ multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string as } // ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed) +let mayRaiseFPException = 1 in class BaseSIMDThreeSameVectorFMLIndex<bit Q, bit U, bits<4> opc, string asm, string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, @@ -8187,6 +8256,7 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm, V128, v4f32, v8f16, OpNode>; } +let mayRaiseFPException = 1 in multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -8369,6 +8439,7 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { V128:$Rm, VectorIndexD:$idx)>; } +let mayRaiseFPException = 1 in multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> { let Predicates = [HasNEON, HasFullFP16] in { def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, @@ -8701,9 +8772,8 @@ multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8728,9 +8798,8 @@ multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (OpNode (extract_high_v4i32 (v4i32 
V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8793,10 +8862,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm, [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), (v4i32 (int_aarch64_neon_sqdmull - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 - (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -8825,10 +8892,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm, [(set (v2i64 V128:$dst), (Accum (v2i64 V128:$Rd), (v2i64 (int_aarch64_neon_sqdmull - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 - (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8881,9 +8946,8 @@ multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm, V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8908,9 +8972,8 @@ multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm, V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; 
let Inst{21} = idx{0}; @@ -8940,9 +9003,8 @@ multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -8967,9 +9029,8 @@ multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -9654,7 +9715,7 @@ multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm, V128, V128, vecshiftL8, asm#"2", ".8h", ".16b", [(set (v8i16 V128:$Rd), - (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), vecshiftL8:$imm))]> { bits<3> imm; let Inst{18-16} = imm; } @@ -9670,7 +9731,7 @@ multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm, V128, V128, vecshiftL16, asm#"2", ".4s", ".8h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), vecshiftL16:$imm))]> { bits<4> imm; let Inst{19-16} = imm; @@ -9687,7 +9748,7 @@ multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm, V128, V128, vecshiftL32, asm#"2", ".2d", ".4s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), vecshiftL32:$imm))]> { bits<5> imm; let Inst{20-16} = imm; } @@ -10671,7 +10732,7 @@ def 
complexrotateopodd : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm < let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; let PrintMethod = "printComplexRotationOp<180, 90>"; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode, RegisterOperand regtype, Operand rottype, string asm, string kind, list<dag> pattern> @@ -10742,7 +10803,7 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype, } } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size, bits<3> opcode, RegisterOperand regtype, @@ -10814,7 +10875,7 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode, } } -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 0, mayRaiseFPException = 1 in class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size, bit opc1, bit opc2, RegisterOperand dst_reg, RegisterOperand lhs_reg, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index a9191924129c..835a7b6cc81d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -42,6 +42,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -1094,7 +1095,10 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, return true; default:; } - return isSEHInstruction(MI); + if (isSEHInstruction(MI)) + return true; + auto Next = std::next(MI.getIterator()); + return Next != 
MBB->end() && Next->isCFIInstruction(); } /// analyzeCompare - For a comparison instruction, return the source registers @@ -1435,7 +1439,7 @@ bool AArch64InstrInfo::optimizeCompareInstr( return false; const MCInstrDesc &MCID = get(NewOpc); CmpInstr.setDesc(MCID); - CmpInstr.RemoveOperand(DeadNZCVIdx); + CmpInstr.removeOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); @@ -1547,27 +1551,6 @@ findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { } } -namespace { - -struct UsedNZCV { - bool N = false; - bool Z = false; - bool C = false; - bool V = false; - - UsedNZCV() = default; - - UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { - this->N |= UsedFlags.N; - this->Z |= UsedFlags.Z; - this->C |= UsedFlags.C; - this->V |= UsedFlags.V; - return *this; - } -}; - -} // end anonymous namespace - /// Find a condition code used by the instruction. /// Returns AArch64CC::Invalid if either the instruction does not use condition /// codes or we don't optimize CmpInstr in the presence of such instructions. @@ -1622,15 +1605,15 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { return UsedFlags; } -/// \returns Conditions flags used after \p CmpInstr in its MachineBB if they -/// are not containing C or V flags and NZCV flags are not alive in successors -/// of the same \p CmpInstr and \p MI parent. \returns None otherwise. +/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV +/// flags are not alive in successors of the same \p CmpInstr and \p MI parent. +/// \returns None otherwise. /// /// Collect instructions using that flags in \p CCUseInstrs if provided. 
-static Optional<UsedNZCV> -examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, - const TargetRegisterInfo &TRI, - SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) { +Optional<UsedNZCV> +llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, + const TargetRegisterInfo &TRI, + SmallVectorImpl<MachineInstr *> *CCUseInstrs) { MachineBasicBlock *CmpParent = CmpInstr.getParent(); if (MI.getParent() != CmpParent) return None; @@ -1652,8 +1635,6 @@ examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) break; } - if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V) - return None; return NZCVUsedAfterCmp; } @@ -1684,7 +1665,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) return false; - if (!examineCFlagsUse(MI, CmpInstr, TRI)) + Optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); + if (!NZVCUsed || NZVCUsed->C || NZVCUsed->V) return false; AccessKind AccessToCheck = AK_Write; @@ -1773,7 +1755,7 @@ static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); // Condition flags are not used in CmpInstr basic block successors and only // Z or N flags allowed to be used after CmpInstr within its basic block - if (!NZCVUsedAfterCmp) + if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) return false; // Z or N flag used after CmpInstr must correspond to the flag used in MI if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || @@ -2270,6 +2252,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::LD1SW_D_IMM: case AArch64::LD1D_IMM: + case AArch64::LD2B_IMM: + case AArch64::LD2H_IMM: + case AArch64::LD2W_IMM: + case AArch64::LD2D_IMM: + case AArch64::LD3B_IMM: + case AArch64::LD3H_IMM: + case AArch64::LD3W_IMM: + case AArch64::LD3D_IMM: + case AArch64::LD4B_IMM: + case AArch64::LD4H_IMM: + case 
AArch64::LD4W_IMM: + case AArch64::LD4D_IMM: + case AArch64::ST1B_IMM: case AArch64::ST1B_H_IMM: case AArch64::ST1B_S_IMM: @@ -2281,6 +2276,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::ST1W_D_IMM: case AArch64::ST1D_IMM: + case AArch64::ST2B_IMM: + case AArch64::ST2H_IMM: + case AArch64::ST2W_IMM: + case AArch64::ST2D_IMM: + case AArch64::ST3B_IMM: + case AArch64::ST3H_IMM: + case AArch64::ST3W_IMM: + case AArch64::ST3D_IMM: + case AArch64::ST4B_IMM: + case AArch64::ST4H_IMM: + case AArch64::ST4W_IMM: + case AArch64::ST4D_IMM: + case AArch64::LD1RB_IMM: case AArch64::LD1RB_H_IMM: case AArch64::LD1RB_S_IMM: @@ -2897,6 +2905,45 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MinOffset = -8; MaxOffset = 7; break; + case AArch64::LD2B_IMM: + case AArch64::LD2H_IMM: + case AArch64::LD2W_IMM: + case AArch64::LD2D_IMM: + case AArch64::ST2B_IMM: + case AArch64::ST2H_IMM: + case AArch64::ST2W_IMM: + case AArch64::ST2D_IMM: + Scale = TypeSize::Scalable(32); + Width = SVEMaxBytesPerVector * 2; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD3B_IMM: + case AArch64::LD3H_IMM: + case AArch64::LD3W_IMM: + case AArch64::LD3D_IMM: + case AArch64::ST3B_IMM: + case AArch64::ST3H_IMM: + case AArch64::ST3W_IMM: + case AArch64::ST3D_IMM: + Scale = TypeSize::Scalable(48); + Width = SVEMaxBytesPerVector * 3; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD4B_IMM: + case AArch64::LD4H_IMM: + case AArch64::LD4W_IMM: + case AArch64::LD4D_IMM: + case AArch64::ST4B_IMM: + case AArch64::ST4H_IMM: + case AArch64::ST4W_IMM: + case AArch64::ST4D_IMM: + Scale = TypeSize::Scalable(64); + Width = SVEMaxBytesPerVector * 4; + MinOffset = -8; + MaxOffset = 7; + break; case AArch64::LD1B_H_IMM: case AArch64::LD1SB_H_IMM: case AArch64::LD1H_S_IMM: @@ -3105,6 +3152,86 @@ bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { return isPreLd(MI) || isPreSt(MI); } +bool AArch64InstrInfo::isPairedLdSt(const 
MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + case AArch64::STGPi: + return true; + } +} + +const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { + unsigned Idx = + AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 + : 1; + return MI.getOperand(Idx); +} + +const MachineOperand & +AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { + unsigned Idx = + AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 + : 2; + return MI.getOperand(Idx); +} + +static const TargetRegisterClass *getRegClass(const MachineInstr &MI, + Register Reg) { + if (MI.getParent() == nullptr) + return nullptr; + const MachineFunction *MF = MI.getParent()->getParent(); + return MF ? 
MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; +} + +bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { + auto IsQFPR = [&](const MachineOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + if (Reg.isPhysical()) + return AArch64::FPR128RegClass.contains(Reg); + const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); + return TRC == &AArch64::FPR128RegClass || + TRC == &AArch64::FPR128_loRegClass; + }; + return llvm::any_of(MI.operands(), IsQFPR); +} + +bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { + auto IsFPR = [&](const MachineOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + if (Reg.isPhysical()) + return AArch64::FPR128RegClass.contains(Reg) || + AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR32RegClass.contains(Reg) || + AArch64::FPR16RegClass.contains(Reg) || + AArch64::FPR8RegClass.contains(Reg); + + const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); + return TRC == &AArch64::FPR128RegClass || + TRC == &AArch64::FPR128_loRegClass || + TRC == &AArch64::FPR64RegClass || + TRC == &AArch64::FPR64_loRegClass || + TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || + TRC == &AArch64::FPR8RegClass; + }; + return llvm::any_of(MI.operands(), IsFPR); +} + // Scale the unscaled offsets. Returns false if the unscaled offset can't be // scaled. static bool scaleOffset(unsigned Opc, int64_t &Offset) { @@ -3370,7 +3497,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Predicate register by ORRing with itself. 
if (AArch64::PPRRegClass.contains(DestReg) && AArch64::PPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVE() && "Unexpected SVE register."); + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) .addReg(SrcReg) // Pg .addReg(SrcReg) @@ -3381,7 +3509,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register by ORRing with itself. if (AArch64::ZPRRegClass.contains(DestReg) && AArch64::ZPRRegClass.contains(SrcReg)) { - assert(Subtarget.hasSVE() && "Unexpected SVE register."); + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -3391,6 +3520,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register pair by copying the individual sub-registers. if (AArch64::ZPR2RegClass.contains(DestReg) && AArch64::ZPR2RegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, Indices); @@ -3400,6 +3531,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register triple by copying the individual sub-registers. if (AArch64::ZPR3RegClass.contains(DestReg) && AArch64::ZPR3RegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -3410,6 +3543,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a Z register quad by copying the individual sub-registers. 
if (AArch64::ZPR4RegClass.contains(DestReg) && AArch64::ZPR4RegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE() || Subtarget.hasSME()) && + "Unexpected SVE register."); static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, AArch64::zsub2, AArch64::zsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, @@ -3979,6 +4114,119 @@ void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( } } +// Convenience function to create a DWARF expression for +// Expr + NumBytes + NumVGScaledBytes * AArch64::VG +static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, + int NumVGScaledBytes, unsigned VG, + llvm::raw_string_ostream &Comment) { + uint8_t buffer[16]; + + if (NumBytes) { + Expr.push_back(dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); + } + + if (NumVGScaledBytes) { + Expr.push_back((uint8_t)dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); + + Expr.push_back((uint8_t)dwarf::DW_OP_bregx); + Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); + Expr.push_back(0); + + Expr.push_back((uint8_t)dwarf::DW_OP_mul); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + + Comment << (NumVGScaledBytes < 0 ? 
" - " : " + ") + << std::abs(NumVGScaledBytes) << " * VG"; + } +} + +// Creates an MCCFIInstruction: +// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } +static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, + unsigned Reg, + const StackOffset &Offset) { + int64_t NumBytes, NumVGScaledBytes; + AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, + NumVGScaledBytes); + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + + if (Reg == AArch64::SP) + Comment << "sp"; + else if (Reg == AArch64::FP) + Comment << "fp"; + else + Comment << printReg(Reg, &TRI); + + // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> Expr; + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); + Expr.push_back(0); + appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_def_cfa. 
+ SmallString<64> DefCfaExpr; + DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); + uint8_t buffer[16]; + DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); + DefCfaExpr.append(Expr.str()); + return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), + Comment.str()); +} + +MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, + unsigned FrameReg, unsigned Reg, + const StackOffset &Offset, + bool LastAdjustmentWasScalable) { + if (Offset.getScalable()) + return createDefCFAExpression(TRI, Reg, Offset); + + if (FrameReg == Reg && !LastAdjustmentWasScalable) + return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); + + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); +} + +MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, + unsigned Reg, + const StackOffset &OffsetFromDefCFA) { + int64_t NumBytes, NumVGScaledBytes; + AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( + OffsetFromDefCFA, NumBytes, NumVGScaledBytes); + + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + + // Non-scalable offsets can use DW_CFA_offset directly. 
+ if (!NumVGScaledBytes) + return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); + + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + Comment << printReg(Reg, &TRI) << " @ cfa"; + + // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> OffsetExpr; + appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_expression + SmallString<64> CfaExpr; + CfaExpr.push_back(dwarf::DW_CFA_expression); + uint8_t buffer[16]; + CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); + CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); + CfaExpr.append(OffsetExpr.str()); + + return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str()); +} + // Helper function to emit a frame offset adjustment from a given // pointer (SrcReg), stored into DestReg. This function is explicit // in that it requires the opcode. @@ -3988,7 +4236,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, - bool *HasWinCFI) { + bool *HasWinCFI, bool EmitCFAOffset, + StackOffset CFAOffset, unsigned FrameReg) { int Sign = 1; unsigned MaxEncoding, ShiftSize; switch (Opc) { @@ -4013,6 +4262,13 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, llvm_unreachable("Unsupported opcode"); } + // `Offset` can be in bytes or in "scalable bytes". + int VScale = 1; + if (Opc == AArch64::ADDVL_XXI) + VScale = 16; + else if (Opc == AArch64::ADDPL_XXI) + VScale = 2; + // FIXME: If the offset won't fit in 24-bits, compute the offset into a // scratch register. 
If DestReg is a virtual register, use it as the // scratch register; otherwise, create a new virtual register (to be @@ -4050,6 +4306,26 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); MBI = MBI.setMIFlag(Flag); + auto Change = + VScale == 1 + ? StackOffset::getFixed(ThisVal << LocalShiftSize) + : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); + if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) + CFAOffset += Change; + else + CFAOffset -= Change; + if (EmitCFAOffset && DestReg == TmpReg) { + MachineFunction &MF = *MBB.getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + + unsigned CFIIndex = MF.addFrameInst( + createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(Flag); + } + if (NeedsWinCFI) { assert(Sign == 1 && "SEH directives should always have a positive sign"); int Imm = (int)(ThisVal << LocalShiftSize); @@ -4086,7 +4362,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, - bool NeedsWinCFI, bool *HasWinCFI) { + bool NeedsWinCFI, bool *HasWinCFI, + bool EmitCFAOffset, StackOffset CFAOffset, + unsigned FrameReg) { int64_t Bytes, NumPredicateVectors, NumDataVectors; AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( Offset, Bytes, NumPredicateVectors, NumDataVectors); @@ -4101,8 +4379,13 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; } emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, - NeedsWinCFI, HasWinCFI); + NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, + FrameReg); + CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) + ? 
StackOffset::getFixed(-Bytes) + : StackOffset::getFixed(Bytes); SrcReg = DestReg; + FrameReg = DestReg; } assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && @@ -4112,14 +4395,17 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, if (NumDataVectors) { emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, - AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr, + EmitCFAOffset, CFAOffset, FrameReg); + CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); SrcReg = DestReg; } if (NumPredicateVectors) { assert(DestReg != AArch64::SP && "Unaligned access to SP"); emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, - AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr, + EmitCFAOffset, CFAOffset, FrameReg); } } @@ -4151,6 +4437,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); return nullptr; } + // Nothing can folded with copy from/to NZCV. 
+ if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV) + return nullptr; } // Handle the case where a copy is being spilled or filled but the source @@ -4577,6 +4866,10 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, return false; } + if (isCombineInstrSettingFlag(CombineOpc) && + MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return false; + return true; } @@ -4919,6 +5212,10 @@ static bool getFMULPatterns(MachineInstr &Root, MachineInstr *MI = nullptr; if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); + // Ignore No-op COPYs in FMUL(COPY(DUP(..))) + if (MI && MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getReg().isVirtual()) + MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); if (MI && MI->getOpcode() == Opcode) { Patterns.push_back(Pattern); return true; @@ -5073,6 +5370,42 @@ bool AArch64InstrInfo::isThroughputPattern( } // end switch (Pattern) return false; } + +/// Find other MI combine patterns. +static bool getMiscPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) +{ + // A - (B + C) ==> (A - B) - C or (A - C) - B + unsigned Opc = Root.getOpcode(); + MachineBasicBlock &MBB = *Root.getParent(); + + switch (Opc) { + case AArch64::SUBWrr: + case AArch64::SUBSWrr: + case AArch64::SUBXrr: + case AArch64::SUBSXrr: + // Found candidate root. 
+ break; + default: + return false; + } + + if (isCombineInstrSettingFlag(Opc) && + Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return false; + + if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) || + canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) || + canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) || + canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) { + Patterns.push_back(MachineCombinerPattern::SUBADD_OP1); + Patterns.push_back(MachineCombinerPattern::SUBADD_OP2); + return true; + } + + return false; +} + /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the @@ -5090,6 +5423,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getFMAPatterns(Root, Patterns)) return true; + // Other patterns + if (getMiscPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -5190,6 +5527,9 @@ genIndexedMultiply(MachineInstr &Root, MachineInstr *Dup = MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); + if (Dup->getOpcode() == TargetOpcode::COPY) + Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); + Register DupSrcReg = Dup->getOperand(1).getReg(); MRI.clearKillFlags(DupSrcReg); MRI.constrainRegClass(DupSrcReg, RC); @@ -5337,6 +5677,53 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, return MUL; } +/// Do the following transformation +/// A - (B + C) ==> (A - B) - C +/// A - (B + C) ==> (A - C) - B +static void +genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + unsigned IdxOpd1, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { + assert(IdxOpd1 == 1 || IdxOpd1 
== 2); + unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1; + MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); + + Register ResultReg = Root.getOperand(0).getReg(); + Register RegA = Root.getOperand(1).getReg(); + bool RegAIsKill = Root.getOperand(1).isKill(); + Register RegB = AddMI->getOperand(IdxOpd1).getReg(); + bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); + Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); + bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); + Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); + + unsigned Opcode = Root.getOpcode(); + if (Opcode == AArch64::SUBSWrr) + Opcode = AArch64::SUBWrr; + else if (Opcode == AArch64::SUBSXrr) + Opcode = AArch64::SUBXrr; + else + assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && + "Unexpected instruction opcode."); + + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), NewVR) + .addReg(RegA, getKillRegState(RegAIsKill)) + .addReg(RegB, getKillRegState(RegBIsKill)); + MachineInstrBuilder MIB2 = + BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), ResultReg) + .addReg(NewVR, getKillRegState(true)) + .addReg(RegC, getKillRegState(RegCIsKill)); + + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + InsInstrs.push_back(MIB1); + InsInstrs.push_back(MIB2); + DelInstrs.push_back(AddMI); +} + /// When getMachineCombinerPatterns() finds potential patterns, /// this function generates the instructions that could replace the /// original code sequence @@ -5359,6 +5746,18 @@ void AArch64InstrInfo::genAlternativeCodeSequence( TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg); return; + case MachineCombinerPattern::SUBADD_OP1: + // A - (B + C) + // ==> (A - B) - C + genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, + InstrIdxForVirtReg); + break; + case MachineCombinerPattern::SUBADD_OP2: + // A - (B + C) + // ==> (A - C) - B + genSubAdd2SubSub(MF, MRI, 
TII, Root, InsInstrs, DelInstrs, 2, + InstrIdxForVirtReg); + break; case MachineCombinerPattern::MULADDW_OP1: case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 @@ -6214,6 +6613,14 @@ void AArch64InstrInfo::genAlternativeCodeSequence( if (MUL) DelInstrs.push_back(MUL); DelInstrs.push_back(&Root); + + // Set the flags on the inserted instructions to be the merged flags of the + // instructions that we have combined. + uint16_t Flags = Root.getFlags(); + if (MUL) + Flags = Root.mergeFlagsWith(*MUL); + for (auto *MI : InsInstrs) + MI->setFlags(Flags); } /// Replace csincr-branch sequence by simple conditional branch @@ -6526,13 +6933,12 @@ enum MachineOutlinerMBBFlags { UnsafeRegsDead = 0x8 }; -unsigned -AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { - assert(C.LRUWasSet && "LRU wasn't set?"); +Register +AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { MachineFunction *MF = C.getMF(); - const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( - MF->getSubtarget().getRegisterInfo()); - + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast<const AArch64RegisterInfo *>(&TRI); // Check if there is an available register across the sequence that we can // use. for (unsigned Reg : AArch64::GPR64RegClass) { @@ -6540,12 +6946,11 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { Reg != AArch64::LR && // LR is not reserved, but don't use it. Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. Reg != AArch64::X17 && // Ditto for X17. - C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) + C.isAvailableAcrossAndOutOfSeq(Reg, TRI) && + C.isAvailableInsideSeq(Reg, TRI)) return Reg; } - - // No suitable register. Return 0. 
- return 0u; + return Register(); } static bool @@ -6691,10 +7096,8 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( unsigned FlagsSetInAll = 0xF; // Compute liveness information for each candidate, and set FlagsSetInAll. - std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), - [&FlagsSetInAll](outliner::Candidate &C) { - FlagsSetInAll &= C.Flags; - }); + for (outliner::Candidate &C : RepeatedSequenceLocs) + FlagsSetInAll &= C.Flags; // According to the AArch64 Procedure Call Standard, the following are // undefined on entry/exit from a function call: @@ -6712,10 +7115,8 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // to compute liveness here. if (C.Flags & UnsafeRegsDead) return false; - C.initLRU(TRI); - LiveRegUnits LRU = C.LRU; - return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || - !LRU.available(AArch64::NZCV)); + return C.isAnyUnavailableAcrossOrOutOfSeq( + {AArch64::W16, AArch64::W17, AArch64::NZCV}, TRI); }; // Are there any candidates where those registers are live? @@ -6752,12 +7153,10 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. unsigned CFICount = 0; - MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); - for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); - Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { - if (MBBI->isCFIInstruction()) + for (auto &I : make_range(RepeatedSequenceLocs[0].front(), + std::next(RepeatedSequenceLocs[0].back()))) { + if (I.isCFIInstruction()) CFICount++; - MBBI++; } // We compare the number of found CFI Instructions to the number of CFI @@ -6860,8 +7259,6 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Check if we have to save LR. 
for (outliner::Candidate &C : RepeatedSequenceLocs) { - C.initLRU(TRI); - // If we have a noreturn caller, then we're going to be conservative and // say that we have to save LR. If we don't have a ret at the end of the // block, then we can't reason about liveness accurately. @@ -6872,7 +7269,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); // Is LR available? If so, we don't need a save. - if (C.LRU.available(AArch64::LR) && !IsNoReturn) { + if (C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) && !IsNoReturn) { NumBytesNoStackCalls += 4; C.setCallInfo(MachineOutlinerNoLRSave, 4); CandidatesWithoutStackFixups.push_back(C); @@ -6888,7 +7285,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Is SP used in the sequence at all? If not, we don't have to modify // the stack, so we are guaranteed to get the same frame. - else if (C.UsedInSequence.available(AArch64::SP)) { + else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { NumBytesNoStackCalls += 12; C.setCallInfo(MachineOutlinerDefault, 12); CandidatesWithoutStackFixups.push_back(C); @@ -6957,11 +7354,12 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // LR to (ie one extra stack save/restore). // if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { - erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { + erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { return (std::any_of( C.front(), std::next(C.back()), [](const MachineInstr &MI) { return MI.isCall(); })) && - (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); + (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || + !findRegisterToSaveLRTo(C)); }); } } @@ -7032,7 +7430,7 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( // modify the stack. Check if hasRedZone is true or unknown; if yes, don't // outline from it. 
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - if (!AFI || AFI->hasRedZone().getValueOr(true)) + if (!AFI || AFI->hasRedZone().value_or(true)) return false; // FIXME: Teach the outliner to generate/handle Windows unwind info. @@ -7053,8 +7451,8 @@ bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, "Suitable Machine Function for outlining must track liveness"); LiveRegUnits LRU(getRegisterInfo()); - std::for_each(MBB.rbegin(), MBB.rend(), - [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); + for (MachineInstr &MI : llvm::reverse(MBB)) + LRU.accumulate(MI); // Check if each of the unsafe registers are available... bool W16AvailableInBlock = LRU.available(AArch64::W16); @@ -7333,14 +7731,17 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(AArch64::SP, RegState::InternalRead); MI.setMIFlag(MachineInstr::FrameSetup); - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); - BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) { + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } // If v8.3a features are available we can replace a RET instruction by - // RETAA or RETAB and omit the AUT instructions + // RETAA or RETAB and omit the AUT instructions. In this case the + // DW_CFA_AARCH64_negate_ra_state can't be emitted. if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && MBBAUT->getOpcode() == AArch64::RET) { BuildMI(MBB, MBBAUT, DL, @@ -7353,6 +7754,11 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, TII->get(ShouldSignReturnAddrWithAKey ? 
AArch64::AUTIASP : AArch64::AUTIBSP)) .setMIFlag(MachineInstr::FrameDestroy); + unsigned CFIIndexAuth = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndexAuth) + .setMIFlags(MachineInstr::FrameDestroy); } } } @@ -7424,24 +7830,26 @@ void AArch64InstrInfo::buildOutlinedFrame( .addImm(-16); It = MBB.insert(It, STRXpre); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const MCRegisterInfo *MRI = STI.getRegisterInfo(); - unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); - - // Add a CFI saying the stack was moved 16 B down. - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); - BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameSetup); - - // Add a CFI saying that the LR that we want to find is now 16 B higher than - // before. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); - BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameSetup); + if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo()) { + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const MCRegisterInfo *MRI = STI.getRegisterInfo(); + unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); + + // Add a CFI saying the stack was moved 16 B down. + int64_t StackPosEntry = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); + BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(StackPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + + // Add a CFI saying that the LR that we want to find is now 16 B higher + // than before. 
+ int64_t LRPosEntry = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); + BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + } // Insert a restore before the terminator for the function. MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) @@ -7495,7 +7903,7 @@ void AArch64InstrInfo::buildOutlinedFrame( MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, - MachineFunction &MF, const outliner::Candidate &C) const { + MachineFunction &MF, outliner::Candidate &C) const { // Are we tail calling? if (C.CallConstructionID == MachineOutlinerTailCall) { @@ -7526,8 +7934,8 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( if (C.CallConstructionID == MachineOutlinerRegSave) { // FIXME: This logic should be sunk into a target-specific interface so that // we don't have to recompute the register. - unsigned Reg = findRegisterToSaveLRTo(C); - assert(Reg != 0 && "No callee-saved register available?"); + Register Reg = findRegisterToSaveLRTo(C); + assert(Reg && "No callee-saved register available?"); // LR has to be a live in so that we can save it. if (!MBB.isLiveIn(AArch64::LR)) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 1054bea40e68..b7a6ac301cdc 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -103,6 +103,21 @@ public: /// Returns whether the instruction is a pre-indexed load/store. static bool isPreLdSt(const MachineInstr &MI); + /// Returns whether the instruction is a paired load/store. + static bool isPairedLdSt(const MachineInstr &MI); + + /// Returns the base register operator of a load/store. 
+ static const MachineOperand &getLdStBaseOp(const MachineInstr &MI); + + /// Returns the the immediate offset operator of a load/store. + static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); + + /// Returns whether the instruction is FP or NEON. + static bool isFpOrNEON(const MachineInstr &MI); + + /// Returns whether the instruction is in Q form (128 bit operands) + static bool isQForm(const MachineInstr &MI); + /// Returns the index for the immediate for a given instruction. static unsigned getLoadStoreImmIdx(unsigned Opc); @@ -283,7 +298,7 @@ public: MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - const outliner::Candidate &C) const override; + outliner::Candidate &C) const override; bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; /// Returns the vector element size (B, H, S or D) of an SVE opcode. uint64_t getElementSizeForOpcode(unsigned Opc) const; @@ -347,7 +362,7 @@ private: /// Returns an unused general-purpose register which can be used for /// constructing an outlined call if one exists. Returns 0 otherwise. - unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; + Register findRegisterToSaveLRTo(outliner::Candidate &C) const; /// Remove a ptest of a predicate-generating operation that already sets, or /// can be made to set, the condition codes in an identical manner @@ -356,12 +371,45 @@ private: const MachineRegisterInfo *MRI) const; }; +struct UsedNZCV { + bool N = false; + bool Z = false; + bool C = false; + bool V = false; + + UsedNZCV() = default; + + UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV +/// flags are not alive in successors of the same \p CmpInstr and \p MI parent. 
+/// \returns None otherwise. +/// +/// Collect instructions using that flags in \p CCUseInstrs if provided. +Optional<UsedNZCV> +examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, + const TargetRegisterInfo &TRI, + SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr); + /// Return true if there is an instruction /after/ \p DefMI and before \p UseMI /// which either reads or clobbers NZCV. bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI); +MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, + unsigned Reg, const StackOffset &Offset, + bool LastAdjustmentWasScalable = true); +MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, + const StackOffset &OffsetFromDefCFA); + /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg /// plus Offset. This is intended to be used from within the prolog/epilog /// insertion (PEI) pass, where a virtual scratch register may be allocated @@ -371,7 +419,9 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false, bool NeedsWinCFI = false, - bool *HasWinCFI = nullptr); + bool *HasWinCFI = nullptr, bool EmitCFAOffset = false, + StackOffset InitialOffset = {}, + unsigned FrameReg = AArch64::SP); /// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the /// FP. Return false if the offset could not be handled directly in MI, and diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 83bf89ff97c5..3802a45ad6c1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -14,196 +14,196 @@ // ARM Instruction Predicate Definitions. 
// def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, - AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">; + AssemblerPredicateWithAll<(all_of HasV8_1aOps), "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, - AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">; + AssemblerPredicateWithAll<(all_of HasV8_2aOps), "armv8.2a">; def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, - AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">; + AssemblerPredicateWithAll<(all_of HasV8_3aOps), "armv8.3a">; def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, - AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">; + AssemblerPredicateWithAll<(all_of HasV8_4aOps), "armv8.4a">; def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, - AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; + AssemblerPredicateWithAll<(all_of HasV8_5aOps), "armv8.5a">; def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, - AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; + AssemblerPredicateWithAll<(all_of HasV8_6aOps), "armv8.6a">; def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">, - AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">; + AssemblerPredicateWithAll<(all_of HasV8_7aOps), "armv8.7a">; def HasV9_0a : Predicate<"Subtarget->hasV9_0aOps()">, - AssemblerPredicate<(all_of HasV9_0aOps), "armv9-a">; + AssemblerPredicateWithAll<(all_of HasV9_0aOps), "armv9-a">; def HasV9_1a : Predicate<"Subtarget->hasV9_1aOps()">, - AssemblerPredicate<(all_of HasV9_1aOps), "armv9.1a">; + AssemblerPredicateWithAll<(all_of HasV9_1aOps), "armv9.1a">; def HasV9_2a : Predicate<"Subtarget->hasV9_2aOps()">, - AssemblerPredicate<(all_of HasV9_2aOps), "armv9.2a">; + AssemblerPredicateWithAll<(all_of HasV9_2aOps), "armv9.2a">; def HasV9_3a : Predicate<"Subtarget->hasV9_3aOps()">, - AssemblerPredicate<(all_of HasV9_3aOps), "armv9.3a">; + AssemblerPredicateWithAll<(all_of HasV9_3aOps), "armv9.3a">; def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">, - AssemblerPredicate<(all_of 
HasV8_0rOps), "armv8-r">; + AssemblerPredicateWithAll<(all_of HasV8_0rOps), "armv8-r">; def HasEL2VMSA : Predicate<"Subtarget->hasEL2VMSA()">, - AssemblerPredicate<(all_of FeatureEL2VMSA), "el2vmsa">; + AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">; def HasEL3 : Predicate<"Subtarget->hasEL3()">, - AssemblerPredicate<(all_of FeatureEL3), "el3">; + AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">; def HasVH : Predicate<"Subtarget->hasVH()">, - AssemblerPredicate<(all_of FeatureVH), "vh">; + AssemblerPredicateWithAll<(all_of FeatureVH), "vh">; def HasLOR : Predicate<"Subtarget->hasLOR()">, - AssemblerPredicate<(all_of FeatureLOR), "lor">; + AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">; def HasPAuth : Predicate<"Subtarget->hasPAuth()">, - AssemblerPredicate<(all_of FeaturePAuth), "pauth">; + AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; def HasJS : Predicate<"Subtarget->hasJS()">, - AssemblerPredicate<(all_of FeatureJS), "jsconv">; + AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">, - AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">; + AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">; def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, - AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">; + AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">; def HasNV : Predicate<"Subtarget->hasNV()">, - AssemblerPredicate<(all_of FeatureNV), "nv">; + AssemblerPredicateWithAll<(all_of FeatureNV), "nv">; def HasMPAM : Predicate<"Subtarget->hasMPAM()">, - AssemblerPredicate<(all_of FeatureMPAM), "mpam">; + AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">; def HasDIT : Predicate<"Subtarget->hasDIT()">, - AssemblerPredicate<(all_of FeatureDIT), "dit">; + AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">; def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, - AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">; + 
AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">; def HasAM : Predicate<"Subtarget->hasAM()">, - AssemblerPredicate<(all_of FeatureAM), "am">; + AssemblerPredicateWithAll<(all_of FeatureAM), "am">; def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, - AssemblerPredicate<(all_of FeatureSEL2), "sel2">; + AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">; def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, - AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">; + AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">; def HasFlagM : Predicate<"Subtarget->hasFlagM()">, - AssemblerPredicate<(all_of FeatureFlagM), "flagm">; + AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">; def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">, - AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">; + AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">; + AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<(all_of FeatureNEON), "neon">; + AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<(all_of FeatureCrypto), "crypto">; + AssemblerPredicateWithAll<(all_of FeatureCrypto), "crypto">; def HasSM4 : Predicate<"Subtarget->hasSM4()">, - AssemblerPredicate<(all_of FeatureSM4), "sm4">; + AssemblerPredicateWithAll<(all_of FeatureSM4), "sm4">; def HasSHA3 : Predicate<"Subtarget->hasSHA3()">, - AssemblerPredicate<(all_of FeatureSHA3), "sha3">; + AssemblerPredicateWithAll<(all_of FeatureSHA3), "sha3">; def HasSHA2 : Predicate<"Subtarget->hasSHA2()">, - AssemblerPredicate<(all_of FeatureSHA2), "sha2">; + AssemblerPredicateWithAll<(all_of FeatureSHA2), "sha2">; def HasAES : Predicate<"Subtarget->hasAES()">, - AssemblerPredicate<(all_of FeatureAES), "aes">; + 
AssemblerPredicateWithAll<(all_of FeatureAES), "aes">; def HasDotProd : Predicate<"Subtarget->hasDotProd()">, - AssemblerPredicate<(all_of FeatureDotProd), "dotprod">; + AssemblerPredicateWithAll<(all_of FeatureDotProd), "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<(all_of FeatureCRC), "crc">; + AssemblerPredicateWithAll<(all_of FeatureCRC), "crc">; def HasLSE : Predicate<"Subtarget->hasLSE()">, - AssemblerPredicate<(all_of FeatureLSE), "lse">; + AssemblerPredicateWithAll<(all_of FeatureLSE), "lse">; def HasNoLSE : Predicate<"!Subtarget->hasLSE()">; def HasRAS : Predicate<"Subtarget->hasRAS()">, - AssemblerPredicate<(all_of FeatureRAS), "ras">; + AssemblerPredicateWithAll<(all_of FeatureRAS), "ras">; def HasRDM : Predicate<"Subtarget->hasRDM()">, - AssemblerPredicate<(all_of FeatureRDM), "rdm">; + AssemblerPredicateWithAll<(all_of FeatureRDM), "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, - AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">; + AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">; def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, - AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">; + AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">; def HasSPE : Predicate<"Subtarget->hasSPE()">, - AssemblerPredicate<(all_of FeatureSPE), "spe">; + AssemblerPredicateWithAll<(all_of FeatureSPE), "spe">; def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, - AssemblerPredicate<(all_of FeatureFuseAES), + AssemblerPredicateWithAll<(all_of FeatureFuseAES), "fuse-aes">; def HasSVE : Predicate<"Subtarget->hasSVE()">, - AssemblerPredicate<(all_of FeatureSVE), "sve">; + AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">; def HasSVE2 : Predicate<"Subtarget->hasSVE2()">, - AssemblerPredicate<(all_of FeatureSVE2), "sve2">; + AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">; def HasSVE2AES : 
Predicate<"Subtarget->hasSVE2AES()">, - AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">; + AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">; def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, - AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">; + AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, - AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">; + AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">; def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, - AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; + AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; def HasSME : Predicate<"Subtarget->hasSME()">, - AssemblerPredicate<(all_of FeatureSME), "sme">; + AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; def HasSMEF64 : Predicate<"Subtarget->hasSMEF64()">, - AssemblerPredicate<(all_of FeatureSMEF64), "sme-f64">; + AssemblerPredicateWithAll<(all_of FeatureSMEF64), "sme-f64">; def HasSMEI64 : Predicate<"Subtarget->hasSMEI64()">, - AssemblerPredicate<(all_of FeatureSMEI64), "sme-i64">; -def HasStreamingSVE : Predicate<"Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(all_of FeatureStreamingSVE), "streaming-sve">; + AssemblerPredicateWithAll<(all_of FeatureSMEI64), "sme-i64">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. 
-def HasSVEorStreamingSVE - : Predicate<"Subtarget->hasSVE() || Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(any_of FeatureSVE, FeatureStreamingSVE), - "streaming-sve or sve">; -def HasSVE2orStreamingSVE - : Predicate<"Subtarget->hasSVE2() || Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(any_of FeatureSVE2, FeatureStreamingSVE), - "streaming-sve or sve2">; +def HasSVEorSME + : Predicate<"Subtarget->hasSVE() || Subtarget->hasSME()">, + AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME), + "sve or sme">; +def HasSVE2orSME + : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME()">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), + "sve2 or sme">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. -def HasNEONorStreamingSVE - : Predicate<"Subtarget->hasNEON() || Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(any_of FeatureNEON, FeatureStreamingSVE), - "streaming-sve or neon">; +def HasNEONorSME + : Predicate<"Subtarget->hasNEON() || Subtarget->hasSME()">, + AssemblerPredicateWithAll<(any_of FeatureNEON, FeatureSME), + "neon or sme">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, - AssemblerPredicate<(all_of FeatureRCPC), "rcpc">; + AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">; +def HasLDAPR : Predicate<"Subtarget->hasLDAPR()">, + AssemblerPredicateWithAll<(all_of FeatureLDAPR), "ldapr">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, - AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">; + AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">; def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">, - AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">; + AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">; def HasSB : Predicate<"Subtarget->hasSB()">, - AssemblerPredicate<(all_of FeatureSB), "sb">; + AssemblerPredicateWithAll<(all_of FeatureSB), "sb">; def HasPredRes : 
Predicate<"Subtarget->hasPredRes()">, - AssemblerPredicate<(all_of FeaturePredRes), "predres">; + AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">; def HasCCDP : Predicate<"Subtarget->hasCCDP()">, - AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">; + AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">; def HasBTI : Predicate<"Subtarget->hasBTI()">, - AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">; + AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">; def HasMTE : Predicate<"Subtarget->hasMTE()">, - AssemblerPredicate<(all_of FeatureMTE), "mte">; + AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">; def HasTME : Predicate<"Subtarget->hasTME()">, - AssemblerPredicate<(all_of FeatureTME), "tme">; + AssemblerPredicateWithAll<(all_of FeatureTME), "tme">; def HasETE : Predicate<"Subtarget->hasETE()">, - AssemblerPredicate<(all_of FeatureETE), "ete">; + AssemblerPredicateWithAll<(all_of FeatureETE), "ete">; def HasTRBE : Predicate<"Subtarget->hasTRBE()">, - AssemblerPredicate<(all_of FeatureTRBE), "trbe">; + AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">; def HasBF16 : Predicate<"Subtarget->hasBF16()">, - AssemblerPredicate<(all_of FeatureBF16), "bf16">; + AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">; def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, - AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">; def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, - AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">; def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, - AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; + AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">; def HasXS : Predicate<"Subtarget->hasXS()">, - AssemblerPredicate<(all_of FeatureXS), "xs">; + AssemblerPredicateWithAll<(all_of FeatureXS), 
"xs">; def HasWFxT : Predicate<"Subtarget->hasWFxT()">, - AssemblerPredicate<(all_of FeatureWFxT), "wfxt">; + AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">; def HasLS64 : Predicate<"Subtarget->hasLS64()">, - AssemblerPredicate<(all_of FeatureLS64), "ls64">; + AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">; def HasBRBE : Predicate<"Subtarget->hasBRBE()">, - AssemblerPredicate<(all_of FeatureBRBE), "brbe">; + AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">; def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">, - AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">; + AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">; def HasHBC : Predicate<"Subtarget->hasHBC()">, - AssemblerPredicate<(all_of FeatureHBC), "hbc">; + AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">; def HasMOPS : Predicate<"Subtarget->hasMOPS()">, - AssemblerPredicate<(all_of FeatureMOPS), "mops">; + AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -350,49 +350,49 @@ def nonext_masked_load : cast<MaskedLoadSDNode>(N)->isUnindexed() && !cast<MaskedLoadSDNode>(N)->isNonTemporal(); }]>; -// sign extending masked load fragments. -def asext_masked_load : +// Any/Zero extending masked load fragments. 
+def azext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), (masked_ld node:$ptr, undef, node:$pred, node:$def),[{ return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD || - cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) && + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD) && cast<MaskedLoadSDNode>(N)->isUnindexed(); }]>; -def asext_masked_load_i8 : +def azext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + (azext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; -def asext_masked_load_i16 : +def azext_masked_load_i16 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + (azext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; -def asext_masked_load_i32 : +def azext_masked_load_i32 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + (azext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; -// zero extending masked load fragments. -def zext_masked_load : +// Sign extending masked load fragments. 
+def sext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ - return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD && + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD && cast<MaskedLoadSDNode>(N)->isUnindexed(); }]>; -def zext_masked_load_i8 : +def sext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + (sext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; -def zext_masked_load_i16 : +def sext_masked_load_i16 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + (sext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; -def zext_masked_load_i32 : +def sext_masked_load_i32 : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + (sext_masked_load node:$ptr, node:$pred, node:$def), [{ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; @@ -443,6 +443,58 @@ def non_temporal_store : cast<MaskedStoreSDNode>(N)->isNonTemporal(); }]>; +multiclass masked_gather_scatter<PatFrags GatherScatterOp> { + // offsets = (signed)Index << sizeof(elt) + def NAME#_signed_scaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = cast<MaskedGatherScatterSDNode>(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return Signed && MGS->isIndexScaled(); + }]>; + // offsets = (signed)Index + def NAME#_signed_unscaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = 
cast<MaskedGatherScatterSDNode>(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return Signed && !MGS->isIndexScaled(); + }]>; + // offsets = (unsigned)Index << sizeof(elt) + def NAME#_unsigned_scaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = cast<MaskedGatherScatterSDNode>(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return !Signed && MGS->isIndexScaled(); + }]>; + // offsets = (unsigned)Index + def NAME#_unsigned_unscaled : + PatFrag<(ops node:$val, node:$pred, node:$ptr, node:$idx), + (GatherScatterOp node:$val, node:$pred, node:$ptr, node:$idx),[{ + auto MGS = cast<MaskedGatherScatterSDNode>(N); + bool Signed = MGS->isIndexSigned() || + MGS->getIndex().getValueType().getVectorElementType() == MVT::i64; + return !Signed && !MGS->isIndexScaled(); + }]>; +} + +defm nonext_masked_gather : masked_gather_scatter<nonext_masked_gather>; +defm azext_masked_gather_i8 : masked_gather_scatter<azext_masked_gather_i8>; +defm azext_masked_gather_i16 : masked_gather_scatter<azext_masked_gather_i16>; +defm azext_masked_gather_i32 : masked_gather_scatter<azext_masked_gather_i32>; +defm sext_masked_gather_i8 : masked_gather_scatter<sext_masked_gather_i8>; +defm sext_masked_gather_i16 : masked_gather_scatter<sext_masked_gather_i16>; +defm sext_masked_gather_i32 : masked_gather_scatter<sext_masked_gather_i32>; + +defm nontrunc_masked_scatter : masked_gather_scatter<nontrunc_masked_scatter>; +defm trunc_masked_scatter_i8 : masked_gather_scatter<trunc_masked_scatter_i8>; +defm trunc_masked_scatter_i16 : masked_gather_scatter<trunc_masked_scatter_i16>; +defm trunc_masked_scatter_i32 : masked_gather_scatter<trunc_masked_scatter_i32>; + // top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise def top16Zero: PatLeaf<(i32 
GPR32:$src), [{ return SDValue(N,0)->getValueType(0) == MVT::i32 && @@ -473,6 +525,11 @@ def AArch64call : SDNode<"AArch64ISD::CALL", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def AArch64call_bti : SDNode<"AArch64ISD::CALL_BTI", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER", SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, @@ -526,6 +583,7 @@ def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; +def AArch64duplane128 : SDNode<"AArch64ISD::DUPLANE128", SDT_AArch64DupLane>; def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>; @@ -612,8 +670,10 @@ def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>; def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; -def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; -def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull, + [SDNPCommutative]>; +def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull, + [SDNPCommutative]>; def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>; @@ -630,11 +690,6 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; -def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>; -def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>; -def AArch64shadd : SDNode<"AArch64ISD::SHADD", 
SDT_AArch64binvec>; -def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>; - def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), [(abdu node:$lhs, node:$rhs), (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; @@ -642,10 +697,21 @@ def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), [(abds node:$lhs, node:$rhs), (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; +def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>; def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>; +def AArch64saddlp_n : SDNode<"AArch64ISD::SADDLP", SDT_AArch64uaddlp>; +def AArch64addp : PatFrags<(ops node:$Rn, node:$Rm), + [(AArch64addp_n node:$Rn, node:$Rm), + (int_aarch64_neon_addp node:$Rn, node:$Rm)]>; def AArch64uaddlp : PatFrags<(ops node:$src), [(AArch64uaddlp_n node:$src), (int_aarch64_neon_uaddlp node:$src)]>; +def AArch64saddlp : PatFrags<(ops node:$src), + [(AArch64saddlp_n node:$src), + (int_aarch64_neon_saddlp node:$src)]>; +def AArch64faddp : PatFrags<(ops node:$Rn, node:$Rm), + [(AArch64addp_n node:$Rn, node:$Rm), + (int_aarch64_neon_faddp node:$Rn, node:$Rm)]>; def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -669,6 +735,22 @@ def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; def AArch64mrs : SDNode<"AArch64ISD::MRS", SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, [SDNPHasChain, SDNPOutGlue]>; + +// Match add node and also treat an 'or' node is as an 'add' if the or'ed operands +// have no common bits. +def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), + [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{ + if (N->getOpcode() == ISD::ADD) + return true; + return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)); +}]> { + let GISelPredicateCode = [{ + // Only handle G_ADD for now. FIXME. 
build capability to compute whether + // operands of G_OR have common bits set or not. + return MI.getOpcode() == TargetOpcode::G_ADD; + }]; +} + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -939,7 +1021,7 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot VectorIndexS:$idx)>; } -let Predicates = [HasNEONorStreamingSVE, HasBF16] in { +let Predicates = [HasNEONorSME, HasBF16] in { def BFCVT : BF16ToSinglePrecision<"bfcvt">; } @@ -1025,6 +1107,15 @@ def : EOR3_pattern<v8i16>; def : EOR3_pattern<v4i32>; def : EOR3_pattern<v2i64>; +class BCAX_pattern<ValueType VecTy> + : Pat<(xor (VecTy V128:$Vn), (and (VecTy V128:$Vm), (vnot (VecTy V128:$Va)))), + (BCAX (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>; + +def : BCAX_pattern<v16i8>; +def : BCAX_pattern<v8i16>; +def : BCAX_pattern<v4i32>; +def : BCAX_pattern<v2i64>; + def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v16i8>; def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v8i16>; def : SHA3_pattern<BCAX, int_aarch64_crypto_bcaxu, v4i32>; @@ -2073,6 +2164,10 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; +def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)), + (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))), + (REV16Xr GPR64:$Rn)>; + //===----------------------------------------------------------------------===// // Bitfield immediate extraction instruction. 
//===----------------------------------------------------------------------===// @@ -2320,6 +2415,8 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { PseudoInstExpansion<(BLR GPR64:$Rn)>; def BLR_RVMARKER : Pseudo<(outs), (ins variable_ops), []>, Sched<[WriteBrReg]>; + def BLR_BTI : Pseudo<(outs), (ins variable_ops), []>, + Sched<[WriteBrReg]>; } // isCall def : Pat<(AArch64call GPR64:$Rn), @@ -2333,6 +2430,10 @@ def : Pat<(AArch64call_rvmarker (i64 tglobaladdr:$rvfunc), GPR64:$Rn), (BLR_RVMARKER tglobaladdr:$rvfunc, GPR64:$Rn)>, Requires<[NoSLSBLRMitigation]>; +def : Pat<(AArch64call_bti GPR64:$Rn), + (BLR_BTI GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; } // isBranch, isTerminator, isBarrier, isIndirectBranch @@ -2359,6 +2460,10 @@ def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> { // augmentation string. def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {} +// Pseudo instruction to tell the streamer to emit a 'G' character into the +// augmentation string. +def EMITMTETAGGED : Pseudo<(outs), (ins), []>, Sched<[]> {} + // FIXME: maybe the scratch register used shouldn't be fixed to X1? // FIXME: can "hasSideEffects be dropped? // This gets lowered to an instruction sequence which takes 16 bytes @@ -2409,7 +2514,8 @@ def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>; // Exception generation instructions. 
//===----------------------------------------------------------------------===// let isTrap = 1 in { -def BRK : ExceptionGeneration<0b001, 0b00, "brk">; +def BRK : ExceptionGeneration<0b001, 0b00, "brk", + [(int_aarch64_break timm32_0_65535:$imm)]>; } def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; @@ -3891,24 +3997,24 @@ defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">; let Predicates = [HasFullFP16] in { - def : Pat<(i32 (lround f16:$Rn)), + def : Pat<(i32 (any_lround f16:$Rn)), (!cast<Instruction>(FCVTASUWHr) f16:$Rn)>; - def : Pat<(i64 (lround f16:$Rn)), + def : Pat<(i64 (any_lround f16:$Rn)), (!cast<Instruction>(FCVTASUXHr) f16:$Rn)>; - def : Pat<(i64 (llround f16:$Rn)), + def : Pat<(i64 (any_llround f16:$Rn)), (!cast<Instruction>(FCVTASUXHr) f16:$Rn)>; } -def : Pat<(i32 (lround f32:$Rn)), +def : Pat<(i32 (any_lround f32:$Rn)), (!cast<Instruction>(FCVTASUWSr) f32:$Rn)>; -def : Pat<(i32 (lround f64:$Rn)), +def : Pat<(i32 (any_lround f64:$Rn)), (!cast<Instruction>(FCVTASUWDr) f64:$Rn)>; -def : Pat<(i64 (lround f32:$Rn)), +def : Pat<(i64 (any_lround f32:$Rn)), (!cast<Instruction>(FCVTASUXSr) f32:$Rn)>; -def : Pat<(i64 (lround f64:$Rn)), +def : Pat<(i64 (any_lround f64:$Rn)), (!cast<Instruction>(FCVTASUXDr) f64:$Rn)>; -def : Pat<(i64 (llround f32:$Rn)), +def : Pat<(i64 (any_llround f32:$Rn)), (!cast<Instruction>(FCVTASUXSr) f32:$Rn)>; -def : Pat<(i64 (llround f64:$Rn)), +def : Pat<(i64 (any_llround f64:$Rn)), (!cast<Instruction>(FCVTASUXDr) f64:$Rn)>; //===----------------------------------------------------------------------===// @@ -3949,20 +4055,20 @@ defm FCVT : FPConversion<"fcvt">; // Floating point single operand instructions. 
//===----------------------------------------------------------------------===// -defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; -defm FMOV : SingleOperandFPData<0b0000, "fmov">; -defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; -defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>; -defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; -defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; -defm FRINTN : SingleOperandFPData<0b1000, "frintn", froundeven>; -defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; +defm FABS : SingleOperandFPDataNoException<0b0001, "fabs", fabs>; +defm FMOV : SingleOperandFPDataNoException<0b0000, "fmov">; +defm FNEG : SingleOperandFPDataNoException<0b0010, "fneg", fneg>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", any_fround>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", any_fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", any_ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", any_froundeven>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", any_fceil>; -defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; +defm FRINTX : SingleOperandFPData<0b1110, "frintx", any_frint>; +defm FRINTZ : SingleOperandFPData<0b1011, "frintz", any_ftrunc>; let SchedRW = [WriteFDiv] in { -defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", any_fsqrt>; } let Predicates = [HasFRInt3264] in { @@ -3972,44 +4078,48 @@ let Predicates = [HasFRInt3264] in { defm FRINT64X : FRIntNNT<0b11, "frint64x", int_aarch64_frint64x>; } // HasFRInt3264 +// Emitting strict_lrint as two instructions is valid as any exceptions that +// occur will happen in exactly one of the instructions (e.g. if the input is +// not an integer the inexact exception will happen in the FRINTX but not then +// in the FCVTZS as the output of FRINTX is an integer). 
let Predicates = [HasFullFP16] in { - def : Pat<(i32 (lrint f16:$Rn)), + def : Pat<(i32 (any_lrint f16:$Rn)), (FCVTZSUWHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>; - def : Pat<(i64 (lrint f16:$Rn)), + def : Pat<(i64 (any_lrint f16:$Rn)), (FCVTZSUXHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>; - def : Pat<(i64 (llrint f16:$Rn)), + def : Pat<(i64 (any_llrint f16:$Rn)), (FCVTZSUXHr (!cast<Instruction>(FRINTXHr) f16:$Rn))>; } -def : Pat<(i32 (lrint f32:$Rn)), +def : Pat<(i32 (any_lrint f32:$Rn)), (FCVTZSUWSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>; -def : Pat<(i32 (lrint f64:$Rn)), +def : Pat<(i32 (any_lrint f64:$Rn)), (FCVTZSUWDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>; -def : Pat<(i64 (lrint f32:$Rn)), +def : Pat<(i64 (any_lrint f32:$Rn)), (FCVTZSUXSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>; -def : Pat<(i64 (lrint f64:$Rn)), +def : Pat<(i64 (any_lrint f64:$Rn)), (FCVTZSUXDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>; -def : Pat<(i64 (llrint f32:$Rn)), +def : Pat<(i64 (any_llrint f32:$Rn)), (FCVTZSUXSr (!cast<Instruction>(FRINTXSr) f32:$Rn))>; -def : Pat<(i64 (llrint f64:$Rn)), +def : Pat<(i64 (any_llrint f64:$Rn)), (FCVTZSUXDr (!cast<Instruction>(FRINTXDr) f64:$Rn))>; //===----------------------------------------------------------------------===// // Floating point two operand instructions. 
//===----------------------------------------------------------------------===// -defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +defm FADD : TwoOperandFPData<0b0010, "fadd", any_fadd>; let SchedRW = [WriteFDiv] in { -defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +defm FDIV : TwoOperandFPData<0b0001, "fdiv", any_fdiv>; } -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaximum>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", fminimum>; +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", any_fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", any_fmaximum>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", any_fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", any_fminimum>; let SchedRW = [WriteFMul] in { -defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; -defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +defm FMUL : TwoOperandFPData<0b0000, "fmul", any_fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>; } -defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; +defm FSUB : TwoOperandFPData<0b0011, "fsub", any_fsub>; def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; @@ -4024,13 +4134,13 @@ def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), // Floating point three operand instructions. 
//===----------------------------------------------------------------------===// -defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", any_fma>; defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", - TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; + TriOpFrag<(any_fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", - TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; + TriOpFrag<(fneg (any_fma node:$LHS, node:$MHS, node:$RHS))> >; defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", - TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; + TriOpFrag<(any_fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; // The following def pats catch the case where the LHS of an FMA is negated. // The TriOpFrag above catches the case where the middle operand is negated. @@ -4159,25 +4269,25 @@ def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), (zext (v8i8 V64:$opB))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)), - (zext (extract_high_v16i8 V128:$opB))))), +def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), + (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), - (zext (extract_high_v16i8 V128:$opB))), + (v8i16 (add (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), + (zext (extract_high_v16i8 (v16i8 V128:$opB)))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)), - (zext (extract_high_v8i16 V128:$opB))))), +def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 
V128:$opA))), + (zext (extract_high_v8i16 (v8i16 V128:$opB)))))), (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), (zext (v2i32 V64:$opB))))), (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)), - (zext (extract_high_v4i32 V128:$opB))))), +def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))), + (zext (extract_high_v4i32 (v4i32 V128:$opB)))))), (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; @@ -4189,7 +4299,7 @@ defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; -defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; +defm FABS : SIMDTwoVectorFPNoException<0, 1, 0b01111, "fabs", fabs>; def : Pat<(v8i8 (AArch64vashr (v8i8 V64:$Rn), (i32 7))), (CMLTv8i8rz V64:$Rn)>; @@ -4219,9 +4329,9 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), (i64 4)))), (FCVTLv8i16 V128:$Rn)>; -def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; @@ -4233,16 +4343,16 @@ def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), def : Pat<(concat_vectors V64:$Rd, (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), 
(FCVTNv2i32 V128:$Rn)>; -def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))), +def : Pat<(v2f32 (any_fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(v4f16 (any_fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (any_fpround (v2f64 V128:$Rn)))), (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; -defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; -defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>; // AArch64's FCVT instructions saturate when out of range. 
multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> { @@ -4272,15 +4382,15 @@ def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>; def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>; def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>; -defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; +defm FNEG : SIMDTwoVectorFPNoException<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; -defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>; -defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; -defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; -defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", froundeven>; -defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; -defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; -defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", any_fround>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", any_fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", any_ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", any_froundeven>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", any_fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", any_frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", any_ftrunc>; let Predicates = [HasFRInt3264] in { defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z", int_aarch64_neon_frint32z>; @@ -4290,7 +4400,7 @@ let Predicates = [HasFRInt3264] in { } // HasFRInt3264 defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; -defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", any_fsqrt>; defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm NOT : 
SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; @@ -4312,9 +4422,9 @@ defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", - BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; -defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; -defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; + BinOpFrag<(add node:$LHS, (AArch64saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", AArch64saddlp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", any_sint_to_fp>; defm SHLL : SIMDVectorLShiftLongBySizeBHS; defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; @@ -4324,7 +4434,7 @@ defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >; defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>; -defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", any_uint_to_fp>; defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; @@ -4348,15 +4458,15 @@ def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> { def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)), (SHLLv8i8 V64:$Rn)>; - def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)), + def : Pat<(AArch64vshl (v8i16 (ext 
(extract_high_v16i8 (v16i8 V128:$Rn)))), (i32 8)), (SHLLv16i8 V128:$Rn)>; def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)), (SHLLv4i16 V64:$Rn)>; - def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)), + def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 (v8i16 V128:$Rn)))), (i32 16)), (SHLLv8i16 V128:$Rn)>; def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)), (SHLLv2i32 V64:$Rn)>; - def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)), + def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 (v4i32 V128:$Rn)))), (i32 32)), (SHLLv4i32 V128:$Rn)>; } @@ -4426,7 +4536,7 @@ def : Pat<(v8i16 (concat_vectors //===----------------------------------------------------------------------===// defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; -defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>; +defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", AArch64addp>; defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; @@ -4447,33 +4557,33 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, V } defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp", AArch64faddp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", any_fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; 
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", any_fdiv>; defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", any_fmaxnum>; defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", any_fmaximum>; defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. 
defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >; defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", any_fmul>; defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", any_fsub>; // MLA and MLS are generated in MachineCombine defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; @@ -4484,7 +4594,7 @@ defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; @@ -4496,14 +4606,14 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; -defm SRHADD : 
SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", avgceils>; defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; @@ -4513,7 +4623,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", avgceilu>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", @@ -4753,11 +4863,13 @@ defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -let Predicates = [HasFullFP16] in { +let Predicates = [HasNEON, HasFullFP16] in { def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 
f16:$Rn, f16:$Rm)>; } +let Predicates = [HasNEON] in { def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>; def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>; +} defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", @@ -4765,9 +4877,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorStreamingSVE>; -defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorStreamingSVE>; -defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorStreamingSVE>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorSME>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorSME>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorSME>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -4862,9 +4974,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorStreamingSVE>; -defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorStreamingSVE>; -defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorStreamingSVE>; +defm FRECPE : 
SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorSME>; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorSME>; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorSME>; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; @@ -4980,23 +5092,21 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. let Predicates = [HasNEON] in { -def : Pat<(f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))), +def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; -def : Pat<(f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))), +def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), (SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>; -def : Pat<(f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))), +def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), (UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>; -def : Pat<(f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))), +def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; let Predicates = [HasFullFP16] in { -def : Pat<(f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))), +def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; -def : Pat<(f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))), +def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } -} - // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. 
@@ -5083,6 +5193,7 @@ def : Pat <(f64 (uint_to_fp (i32 (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; // 64-bits -> double are handled in target specific dag combine: // performIntToFpCombine. +} // let Predicates = [HasNEON] //===----------------------------------------------------------------------===// // Advanced SIMD three different-sized vector instructions. @@ -5102,10 +5213,10 @@ defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>; defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", @@ -5123,10 +5234,10 @@ defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDLongThreeVectorBHS<1, 
0b1100, "umull", AArch64umull>; defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", @@ -5161,74 +5272,15 @@ multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperat V64:$Rn, V64:$Rm)), dsub)>; } -defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull, +defm : Neon_mul_acc_widen_patterns<add, AArch64umull, UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; -defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull, +defm : Neon_mul_acc_widen_patterns<add, AArch64smull, SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; -defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull, +defm : Neon_mul_acc_widen_patterns<sub, AArch64umull, UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; -defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull, +defm : Neon_mul_acc_widen_patterns<sub, AArch64smull, SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; -// Additional patterns for SMULL and UMULL -multiclass Neon_mul_widen_patterns<SDPatternOperator opnode, - Instruction INST8B, Instruction INST4H, Instruction INST2S> { - def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))), - (INST8B V64:$Rn, V64:$Rm)>; - def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))), - (INST4H V64:$Rn, V64:$Rm)>; - def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))), - (INST2S V64:$Rn, V64:$Rm)>; -} - -defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16, - SMULLv4i16_v4i32, SMULLv2i32_v2i64>; -defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16, - UMULLv4i16_v4i32, UMULLv2i32_v2i64>; - -// Patterns for smull2/umull2. 
-multiclass Neon_mul_high_patterns<SDPatternOperator opnode, - Instruction INST8B, Instruction INST4H, Instruction INST2S> { - def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm))), - (INST8B V128:$Rn, V128:$Rm)>; - def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm))), - (INST4H V128:$Rn, V128:$Rm)>; - def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm))), - (INST2S V128:$Rn, V128:$Rm)>; -} - -defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16, - SMULLv8i16_v4i32, SMULLv4i32_v2i64>; -defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16, - UMULLv8i16_v4i32, UMULLv4i32_v2i64>; - -// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL -multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode, - Instruction INST8B, Instruction INST4H, Instruction INST2S> { - def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))), - (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>; - def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))), - (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>; - def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))), - (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>; -} - -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, - SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, - UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, - SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, - UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; - // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 
V64:$Rn, V64:$Rm)>; @@ -5392,19 +5444,22 @@ defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; +// Only the lower half of the result of the inner FADDP is used in the patterns +// below, so the second operand does not matter. Re-use the first input +// operand, so no additional dependencies need to be introduced. let Predicates = [HasFullFP16] in { def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))), (FADDPv2i16p (EXTRACT_SUBREG - (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))), + (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), V128:$Rn), dsub))>; def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))), - (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>; + (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>; } def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))), (FADDPv2i32p (EXTRACT_SUBREG - (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))), + (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))), (FADDPv2i32p V64:$Rn)>; @@ -5856,24 +5911,28 @@ defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; -// Patterns for uaddv(uaddlp(x)) ==> uaddlv -def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef, - (v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))), - (i64 0))), (i64 0))), - (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (UADDLVv8i8v V64:$op), hsub), ssub)>; -def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp - (v16i8 V128:$op))))), (i64 0))), - (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (UADDLVv16i8v V128:$op), hsub), ssub)>; -def : Pat<(v4i32 (AArch64uaddv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), 
(UADDLVv8i16v V128:$op), ssub)>; - -// Patterns for addp(uaddlp(x))) ==> uaddlv -def : Pat<(v2i32 (AArch64uaddv (v2i32 (AArch64uaddlp (v4i16 V64:$op))))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (UADDLVv4i16v V64:$op), ssub)>; -def : Pat<(v2i64 (AArch64uaddv (v2i64 (AArch64uaddlp (v4i32 V128:$op))))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLVv4i32v V128:$op), dsub)>; +multiclass SIMDAcrossLaneLongPairIntrinsic<string Opc, SDPatternOperator addlp> { + // Patterns for addv(addlp(x)) ==> addlv + def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef, + (v4i16 (AArch64uaddv (v4i16 (addlp (v8i8 V64:$op))))), + (i64 0))), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (!cast<Instruction>(Opc#"v8i8v") V64:$op), hsub), ssub)>; + def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (addlp (v16i8 V128:$op))))), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (!cast<Instruction>(Opc#"v16i8v") V128:$op), hsub), ssub)>; + def : Pat<(v4i32 (AArch64uaddv (v4i32 (addlp (v8i16 V128:$op))))), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v8i16v") V128:$op), ssub)>; + + // Patterns for addp(addlp(x))) ==> addlv + def : Pat<(v2i32 (AArch64uaddv (v2i32 (addlp (v4i16 V64:$op))))), + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v4i16v") V64:$op), ssub)>; + def : Pat<(v2i64 (AArch64uaddv (v2i64 (addlp (v4i32 V128:$op))))), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v4i32v") V128:$op), dsub)>; +} + +defm : SIMDAcrossLaneLongPairIntrinsic<"UADDLV", AArch64uaddlp>; +defm : SIMDAcrossLaneLongPairIntrinsic<"SADDLV", AArch64saddlp>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. 
@@ -6185,6 +6244,14 @@ def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>; let isReMaterializable = 1, isAsCheapAsAMove = 1 in defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; +let Predicates = [HasNEON] in { + // Using the MOVI to materialize fp constants. + def : Pat<(f32 fpimm32SIMDModImmType4:$in), + (EXTRACT_SUBREG (MOVIv2i32 (fpimm32SIMDModImmType4XForm f32:$in), + (i32 24)), + ssub)>; +} + def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; @@ -6273,18 +6340,18 @@ let hasSideEffects = 0 in { // On the other hand, there are quite a few valid combinatorial options due to // the commutativity of multiplication and the fact that (-x) * y = x * (-y). defm : SIMDFPIndexedTiedPatterns<"FMLA", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)>>; defm : SIMDFPIndexedTiedPatterns<"FMLA", - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)>>; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + TriOpFrag<(any_fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 
64-bit @@ -6363,22 +6430,22 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { } defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >; defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)> >; defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>; -def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), +def : Pat<(v2f32 (any_fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; -def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), +def : Pat<(v4f32 (any_fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv4i32_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; -def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), +def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), (FMULv2i64_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), (i64 0))>; @@ -6397,11 +6464,10 @@ defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>; defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", - int_aarch64_neon_smull>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>; defm 
SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", @@ -6412,11 +6478,10 @@ defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", int_aarch64_neon_sqrdmlsh>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", - int_aarch64_neon_umull>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", AArch64umull>; // A scalar sqdmull with the second operand being a vector lane can be // handled directly with the indexed instruction encoding. @@ -6425,22 +6490,6 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), VectorIndexS:$idx)), (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; -// Match add node and also treat an 'or' node is as an 'add' if the or'ed operands -// have no common bits. -def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), - [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{ - if (N->getOpcode() == ISD::ADD) - return true; - return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)); -}]> { - let GISelPredicateCode = [{ - // Only handle G_ADD for now. FIXME. build capability to compute whether - // operands of G_OR have common bits set or not. 
- return MI.getOpcode() == TargetOpcode::G_ADD; - }]; -} - - //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- @@ -6480,7 +6529,7 @@ def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -// Patterns for FP16 Instrinsics - requires reg copy to/from as i16s not supported. +// Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported. def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)), (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>; @@ -6787,7 +6836,7 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST> dsub)), 0), ssub)))>, - Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; @@ -6807,7 +6856,8 @@ class SExtLoadi16CVTf32Pat<dag addrmode, dag INST> INST, hsub), 0), - ssub)))>, Requires<[NotForCodeSize]>; + ssub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; @@ -6841,7 +6891,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> dsub)), 0), dsub)))>, - Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; @@ -6860,7 +6910,8 @@ class SExtLoadi32CVTf64Pat<dag addrmode, dag INST> INST, ssub), 0), - dsub)))>, Requires<[NotForCodeSize]>; + dsub)))>, + Requires<[NotForCodeSize, 
UseAlternateSExtLoadCVTF32, HasNEON]>; def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; @@ -7216,14 +7267,6 @@ def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0 //---------------------------------------------------------------------------- // FIXME: Like for X86, these should go in their own separate .td file. -def def32 : PatLeaf<(i32 GPR32:$src), [{ - return isDef32(*N); -}]>; - -// In the case of a 32-bit def that is known to implicitly zero-extend, -// we can use a SUBREG_TO_REG. -def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; - // For an anyext, we don't care what the high bits are, so we can perform an // INSERT_SUBREF into an IMPLICIT_DEF. def : Pat<(i64 (anyext GPR32:$src)), @@ -7387,99 +7430,16 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), // // Natural vector casts (64 bit) -def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 
FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>; - -def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +foreach VT = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, v1f64, f64 ] in + foreach VT2 = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, v1f64, f64 ] in + def : Pat<(VT (AArch64NvCast (VT2 FPR64:$src))), + (VT FPR64:$src)>; // Natural vector casts (128 bit) -def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), 
(v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : 
Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; -def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; +foreach VT = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in + foreach VT2 = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in + def : Pat<(VT (AArch64NvCast (VT2 FPR128:$src))), + (VT FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; @@ -8093,17 +8053,17 @@ defm : InsertSubvectorUndef<i64>; def : Pat<(i64 (add (vector_extract (v2i64 
FPR128:$Rn), (i64 0)), (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; -def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), - (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), +def : Pat<(f64 (any_fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, // so we match on v4f32 here, not v2f32. This will also catch adding // the low two lanes of a true v4f32 vector. -def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), - (vector_extract (v4f32 FPR128:$Rn), (i64 1))), +def : Pat<(any_fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; -def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), - (vector_extract (v8f16 FPR128:$Rn), (i64 1))), +def : Pat<(any_fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), + (vector_extract (v8f16 FPR128:$Rn), (i64 1))), (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Scalar 64-bit shifts in FPR64 registers. diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 6aefc1fdb599..eaf39fc0dbb1 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -9,6 +9,12 @@ // This file contains a pass that performs load / store related peephole // optimizations. This pass should be run after register allocation. // +// The pass runs after the PrologEpilogInserter where we emit the CFI +// instructions. In order to preserve the correctness of the unwind informaiton, +// the pass should not change the order of any two instructions, one of which +// has the FrameSetup/FrameDestroy flag or, alternatively, apply an add-hoc fix +// to unwind information. 
+// //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" @@ -31,6 +37,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -549,26 +556,6 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { } } -static bool isPairedLdSt(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return false; - case AArch64::LDPSi: - case AArch64::LDPSWi: - case AArch64::LDPDi: - case AArch64::LDPQi: - case AArch64::LDPWi: - case AArch64::LDPXi: - case AArch64::STPSi: - case AArch64::STPDi: - case AArch64::STPQi: - case AArch64::STPWi: - case AArch64::STPXi: - case AArch64::STGPi: - return true; - } -} - static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { unsigned OpcA = FirstMI.getOpcode(); @@ -603,7 +590,7 @@ static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { // Returns the scale and offset range of pre/post indexed variants of MI. static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, int &MinOffset, int &MaxOffset) { - bool IsPaired = isPairedLdSt(MI); + bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI); bool IsTagStore = isTagStore(MI); // ST*G and all paired ldst have the same scale in pre/post-indexed variants // as in the "unsigned offset" variant. @@ -625,17 +612,8 @@ static MachineOperand &getLdStRegOp(MachineInstr &MI, bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI); if (IsPreLdSt) PairedRegOp += 1; - unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0; - return MI.getOperand(Idx); -} - -static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { - unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
2 : 1; - return MI.getOperand(Idx); -} - -static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { - unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2; + unsigned Idx = + AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0; return MI.getOperand(Idx); } @@ -645,12 +623,14 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); int LoadSize = TII->getMemScale(LoadInst); int StoreSize = TII->getMemScale(StoreInst); - int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst) - ? getLdStOffsetOp(StoreInst).getImm() - : getLdStOffsetOp(StoreInst).getImm() * StoreSize; - int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst) - ? getLdStOffsetOp(LoadInst).getImm() - : getLdStOffsetOp(LoadInst).getImm() * LoadSize; + int UnscaledStOffset = + TII->hasUnscaledLdStOffset(StoreInst) + ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() + : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize; + int UnscaledLdOffset = + TII->hasUnscaledLdStOffset(LoadInst) + ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() + : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize; return (UnscaledStOffset <= UnscaledLdOffset) && (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } @@ -729,7 +709,7 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { case AArch64::STPWi: case AArch64::STPXi: // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) + if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) return false; return true; @@ -763,17 +743,18 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); + MergeForward ? 
AArch64InstrInfo::getLdStBaseOp(*MergeMI) + : AArch64InstrInfo::getLdStBaseOp(*I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI; - if (getLdStOffsetOp(*I).getImm() == - getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) + if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() == + AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) RtMI = &*MergeMI; else RtMI = &*I; - int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm(); // Change the scaled offset from small to large type. if (IsScaled) { assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); @@ -923,6 +904,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, assert(all_of(MI.operands(), [this, &RenameReg](const MachineOperand &MOP) { return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() || + MOP.isUndef() || !TRI->regsOverlap(MOP.getReg(), *RenameReg); }) && "Rename register used between paired instruction, trashing the " @@ -936,10 +918,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); + MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired) + : AArch64InstrInfo::getLdStBaseOp(*I); - int Offset = getLdStOffsetOp(*I).getImm(); - int PairedOffset = getLdStOffsetOp(*Paired).getImm(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm(); + int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm(); bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode()); if (IsUnscaled != PairedIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. 
If @@ -974,7 +957,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, RtMI = &*I; Rt2MI = &*Paired; } - int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm(); // Scale the immediate offset, if necessary. if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) { assert(!(OffsetImm % TII->getMemScale(*RtMI)) && @@ -1132,12 +1115,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) && "Unsupported ld/st match"); assert(LoadSize <= StoreSize && "Invalid load size"); - int UnscaledLdOffset = IsUnscaled - ? getLdStOffsetOp(*LoadI).getImm() - : getLdStOffsetOp(*LoadI).getImm() * LoadSize; - int UnscaledStOffset = IsUnscaled - ? getLdStOffsetOp(*StoreI).getImm() - : getLdStOffsetOp(*StoreI).getImm() * StoreSize; + int UnscaledLdOffset = + IsUnscaled + ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() + : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize; + int UnscaledStOffset = + IsUnscaled + ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() + : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; Register DestReg = IsStoreXReg ? Register(TRI->getMatchingSuperReg( @@ -1235,7 +1220,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; MachineInstr &LoadMI = *I; - Register BaseReg = getLdStBaseOp(LoadMI).getReg(); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg(); // If the load is the first instruction in the block, there's obviously // not any matching store. @@ -1264,7 +1249,8 @@ bool AArch64LoadStoreOpt::findMatchingStore( // Also we can't handle stores without an immediate offset operand, // while the operand might be the address for a global variable. 
if (MI.mayStore() && isMatchingStore(LoadMI, MI) && - BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() && + BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() && + AArch64InstrInfo::getLdStOffsetOp(MI).isImm() && isLdOffsetInRangeOfSt(LoadMI, MI, TII) && ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) { StoreI = MBBI; @@ -1467,18 +1453,19 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, return true; } -// Check if we can find a physical register for renaming. This register must: -// * not be defined up to FirstMI (checking DefinedInBB) -// * not used between the MI and the defining instruction of the register to -// rename (checked using UsedInBetween). +// Check if we can find a physical register for renaming \p Reg. This register +// must: +// * not be defined already in \p DefinedInBB; DefinedInBB must contain all +// defined registers up to the point where the renamed register will be used, +// * not used in \p UsedInBetween; UsedInBetween must contain all accessed +// registers in the range the rename register will be used, // * is available in all used register classes (checked using RequiredClasses). static Optional<MCPhysReg> tryToFindRegisterToRename( - MachineInstr &FirstMI, MachineInstr &MI, LiveRegUnits &DefinedInBB, + const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween, SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, const TargetRegisterInfo *TRI) { - auto &MF = *FirstMI.getParent()->getParent(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); + const MachineRegisterInfo &RegInfo = MF.getRegInfo(); // Checks if any sub- or super-register of PR is callee saved. 
auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) { @@ -1499,7 +1486,7 @@ static Optional<MCPhysReg> tryToFindRegisterToRename( }); }; - auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg()); + auto *RegClass = TRI->getMinimalPhysRegClass(Reg); for (const MCPhysReg &PR : *RegClass) { if (DefinedInBB.available(PR) && UsedInBetween.available(PR) && !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) && @@ -1530,8 +1517,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI); Register Reg = getLdStRegOp(FirstMI).getReg(); - Register BaseReg = getLdStBaseOp(FirstMI).getReg(); - int Offset = getLdStOffsetOp(FirstMI).getImm(); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm(); int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); @@ -1566,7 +1553,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, Flags.setSExtIdx(-1); if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && - getLdStOffsetOp(MI).isImm()) { + AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) { assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. @@ -1574,8 +1561,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. 
- Register MIBaseReg = getLdStBaseOp(MI).getReg(); - int MIOffset = getLdStOffsetOp(MI).getImm(); + Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg(); + int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI); if (IsUnscaled != MIIsUnscaled) { // We're trying to pair instructions that differ in how they are scaled. @@ -1606,15 +1593,16 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // can't be paired: bail and keep looking. if (IsPreLdSt) { bool IsOutOfBounds = MIOffset != TII->getMemScale(MI); - bool IsBaseRegUsed = - !UsedRegUnits.available(getLdStBaseOp(MI).getReg()); - bool IsBaseRegModified = - !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg()); + bool IsBaseRegUsed = !UsedRegUnits.available( + AArch64InstrInfo::getLdStBaseOp(MI).getReg()); + bool IsBaseRegModified = !ModifiedRegUnits.available( + AArch64InstrInfo::getLdStBaseOp(MI).getReg()); // If the stored value and the address of the second instruction is // the same, it needs to be using the updated register and therefore // it must not be folded. 
- bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(), - getLdStBaseOp(MI).getReg()); + bool IsMIRegTheSame = + TRI->regsOverlap(getLdStRegOp(MI).getReg(), + AArch64InstrInfo::getLdStBaseOp(MI).getReg()); if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified || IsMIRegTheSame) { LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, @@ -1722,8 +1710,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, if (*MaybeCanRename) { Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename( - FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses, - TRI); + *FirstMI.getParent()->getParent(), Reg, DefinedInBB, + UsedInBetween, RequiredClasses, TRI); if (MaybeRenameReg) { Flags.setRenameReg(*MaybeRenameReg); Flags.setMergeForward(true); @@ -1760,6 +1748,28 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; } +static MachineBasicBlock::iterator +maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) { + auto End = MI.getParent()->end(); + if (MaybeCFI == End || + MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION || + !(MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy)) || + AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP) + return End; + + const MachineFunction &MF = *MI.getParent()->getParent(); + unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex(); + const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex]; + switch (CFI.getOperation()) { + case MCCFIInstruction::OpDefCfa: + case MCCFIInstruction::OpDefCfaOffset: + return MaybeCFI; + default: + return End; + } +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, @@ -1769,6 +1779,12 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, "Unexpected base register update instruction to merge!"); MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator 
NextI = next_nodbg(I, E); + + // If updating the SP and the following instruction is CFA offset related CFI + // instruction move it after the merged instruction. + MachineBasicBlock::iterator CFI = + IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E; + // Return the instruction following the merged instruction, which is // the instruction following our unmerged load. Unless that's the add/sub // instruction we're merging, in which case it's the one after that. @@ -1786,12 +1802,12 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, MachineInstrBuilder MIB; int Scale, MinOffset, MaxOffset; getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset); - if (!isPairedLdSt(*I)) { + if (!AArch64InstrInfo::isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I)) - .add(getLdStBaseOp(*I)) + .add(AArch64InstrInfo::getLdStBaseOp(*I)) .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); @@ -1801,12 +1817,15 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I, 0)) .add(getLdStRegOp(*I, 1)) - .add(getLdStBaseOp(*I)) + .add(AArch64InstrInfo::getLdStBaseOp(*I)) .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); } - (void)MIB; + if (CFI != E) { + MachineBasicBlock *MBB = I->getParent(); + MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI); + } if (IsPreIdx) { ++NumPreFolded; @@ -1888,8 +1907,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - Register BaseReg = getLdStBaseOp(MemMI).getReg(); - int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); + int MIUnscaledOffset = 
AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() * + TII->getMemScale(MemMI); // Scan forward looking for post-index opportunities. Updating instructions // can't be formed if the memory instruction doesn't have the offset we're @@ -1904,7 +1924,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // behavior in this case unlike normal stores, and always performs writeback // after reading the source register value. if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) { - bool IsPairedInsn = isPairedLdSt(MemMI); + bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI); for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { Register DestReg = getLdStRegOp(MemMI, i).getReg(); if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) @@ -1965,8 +1985,8 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator MBBI = I; MachineFunction &MF = *MemMI.getMF(); - Register BaseReg = getLdStBaseOp(MemMI).getReg(); - int Offset = getLdStOffsetOp(MemMI).getImm(); + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. @@ -1975,7 +1995,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // If the base register overlaps a destination register, we can't // merge the update. if (!isTagStore(MemMI)) { - bool IsPairedInsn = isPairedLdSt(MemMI); + bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI); for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { Register DestReg = getLdStRegOp(MemMI, i).getReg(); if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) @@ -2045,7 +2065,7 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( // Make sure this is a reg+imm. 
// FIXME: It is possible to extend it to handle reg+reg cases. - if (!getLdStOffsetOp(MI).isImm()) + if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) return false; // Look backward up to LdStLimit instructions. @@ -2099,7 +2119,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { // range, plus allow an extra one in case we find a later insn that matches // with Offset-1) bool IsUnscaled = TII->hasUnscaledLdStOffset(MI); - int Offset = getLdStOffsetOp(MI).getImm(); + int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1; // Allow one more for offset. if (Offset > 0) @@ -2166,7 +2186,8 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate // The immediate in the load/store is scaled by the size of the memory // operation. The immediate in the add we're looking for, // however, is not, so adjust here. - int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI); + int UnscaledOffset = + AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI); // Look forward to try to find a pre-index instruction. 
For example, // ldr x1, [x0, #64] @@ -2268,7 +2289,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; - Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + Subtarget = &Fn.getSubtarget<AArch64Subtarget>(); TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 1fc5617b49f6..5c7fb0deecd0 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -60,12 +60,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { MachineLoopInfo *MLI; MachineRegisterInfo *MRI; + using OpcodePair = std::pair<unsigned, unsigned>; template <typename T> using SplitAndOpcFunc = - std::function<Optional<unsigned>(T, unsigned, T &, T &)>; + std::function<Optional<OpcodePair>(T, unsigned, T &, T &)>; using BuildMIFunc = - std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register, - Register, Register)>; + std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned, + Register, Register, Register)>; /// For instructions where an immediate operand could be split into two /// separate immediate instructions, use the splitTwoPartImm two handle the @@ -83,20 +84,19 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { /// %dst = <Instr>ri %tmp (encode half IMM) [...] 
template <typename T> bool splitTwoPartImm(MachineInstr &MI, - SmallSetVector<MachineInstr *, 8> &ToBeRemoved, SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr); bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI, MachineInstr *&SubregToRegMI); template <typename T> - bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, - SmallSetVector<MachineInstr *, 8> &ToBeRemoved); + bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI); template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI, - SmallSetVector<MachineInstr *, 8> &ToBeRemoved); - bool visitORR(MachineInstr &MI, - SmallSetVector<MachineInstr *, 8> &ToBeRemoved); + bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); + + template <typename T> + bool visitAND(unsigned Opc, MachineInstr &MI); + bool visitORR(MachineInstr &MI); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -157,8 +157,7 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { template <typename T> bool AArch64MIPeepholeOpt::visitAND( - unsigned Opc, MachineInstr &MI, - SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { + unsigned Opc, MachineInstr &MI) { // Try below transformation. // // MOVi32imm + ANDWrr ==> ANDWri + ANDWri @@ -170,28 +169,27 @@ bool AArch64MIPeepholeOpt::visitAND( // mov + and instructions. 
return splitTwoPartImm<T>( - MI, ToBeRemoved, - [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> { + MI, + [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<OpcodePair> { if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) - return Opc; + return std::make_pair(Opc, Opc); return None; }, - [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, unsigned Imm1, Register SrcReg, Register NewTmpReg, Register NewDstReg) { DebugLoc DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) .addReg(SrcReg) .addImm(Imm0); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) .addReg(NewTmpReg) .addImm(Imm1); }); } -bool AArch64MIPeepholeOpt::visitORR( - MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { +bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) { // Check this ORR comes from below zero-extend pattern. // // def : Pat<(i64 (zext GPR32:$src)), @@ -216,19 +214,38 @@ bool AArch64MIPeepholeOpt::visitORR( // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is // real AArch64 instruction and if it is not, do not process the opcode // conservatively. - if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) + if (SrcMI->getOpcode() == TargetOpcode::COPY && + SrcMI->getOperand(1).getReg().isVirtual()) { + const TargetRegisterClass *RC = + MRI->getRegClass(SrcMI->getOperand(1).getReg()); + + // A COPY from an FPR will become a FMOVSWr, so do so now so that we know + // that the upper bits are zero. 
+ if (RC != &AArch64::FPR32RegClass && + ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) || + SrcMI->getOperand(1).getSubReg() != AArch64::ssub)) + return false; + Register CpySrc = SrcMI->getOperand(1).getReg(); + if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) { + CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), + TII->get(TargetOpcode::COPY), CpySrc) + .add(SrcMI->getOperand(1)); + } + BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(), + TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg()) + .addReg(CpySrc); + SrcMI->eraseFromParent(); + } + else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) return false; Register DefReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(2).getReg(); MRI->replaceRegWith(DefReg, SrcReg); MRI->clearKillFlags(SrcReg); - // replaceRegWith changes MI's definition register. Keep it for SSA form until - // deleting MI. - MI.getOperand(0).setReg(DefReg); - ToBeRemoved.insert(&MI); - LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n"); + MI.eraseFromParent(); return true; } @@ -255,8 +272,7 @@ static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { template <typename T> bool AArch64MIPeepholeOpt::visitADDSUB( - unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, - SmallSetVector<MachineInstr *, 8> &ToBeRemoved) { + unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) { // Try below transformation. // // MOVi32imm + ADDWrr ==> ADDWri + ADDWri @@ -271,25 +287,65 @@ bool AArch64MIPeepholeOpt::visitADDSUB( // multiple `mov` + `and/sub` instructions. 
return splitTwoPartImm<T>( - MI, ToBeRemoved, + MI, [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> Optional<unsigned> { + T &Imm1) -> Optional<OpcodePair> { if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) - return PosOpc; + return std::make_pair(PosOpc, PosOpc); if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) - return NegOpc; + return std::make_pair(NegOpc, NegOpc); return None; }, - [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, unsigned Imm1, Register SrcReg, Register NewTmpReg, Register NewDstReg) { DebugLoc DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) .addReg(SrcReg) .addImm(Imm0) .addImm(12); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) + .addReg(NewTmpReg) + .addImm(Imm1) + .addImm(0); + }); +} + +template <typename T> +bool AArch64MIPeepholeOpt::visitADDSSUBS( + OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) { + // Try the same transformation as ADDSUB but with additional requirement + // that the condition code usages are only for Equal and Not Equal + return splitTwoPartImm<T>( + MI, + [PosOpcs, NegOpcs, &MI, &TRI = TRI, &MRI = MRI]( + T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<OpcodePair> { + OpcodePair OP; + if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) + OP = PosOpcs; + else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) + OP = NegOpcs; + else + return None; + // Check conditional uses last since it is expensive for scanning + // proceeding instructions + MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg()); + Optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI); + if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V) + return None; + return OP; + }, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, + 
unsigned Imm1, Register SrcReg, Register NewTmpReg, + Register NewDstReg) { + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) + .addReg(SrcReg) + .addImm(Imm0) + .addImm(12); + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) .addReg(NewTmpReg) .addImm(Imm1) .addImm(0); @@ -338,7 +394,7 @@ bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, template <typename T> bool AArch64MIPeepholeOpt::splitTwoPartImm( - MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved, + MachineInstr &MI, SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) { unsigned RegSize = sizeof(T) * 8; assert((RegSize == 32 || RegSize == 64) && @@ -357,39 +413,63 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm( // number since it was sign extended when we assign to the 64-bit Imm. if (SubregToRegMI) Imm &= 0xFFFFFFFF; - unsigned Opcode; + OpcodePair Opcode; if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1)) - Opcode = R.getValue(); + Opcode = *R; else return false; - // Create new ADD/SUB MIs. + // Create new MIs using the first and second opcodes. Opcodes might differ for + // flag setting operations that should only set flags on second instruction. + // NewTmpReg = Opcode.first SrcReg Imm0 + // NewDstReg = Opcode.second NewTmpReg Imm1 + + // Determine register classes for destinations and register operands MachineFunction *MF = MI.getMF(); - const TargetRegisterClass *RC = - TII->getRegClass(TII->get(Opcode), 0, TRI, *MF); - const TargetRegisterClass *ORC = - TII->getRegClass(TII->get(Opcode), 1, TRI, *MF); + const TargetRegisterClass *FirstInstrDstRC = + TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF); + const TargetRegisterClass *FirstInstrOperandRC = + TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF); + const TargetRegisterClass *SecondInstrDstRC = + (Opcode.first == Opcode.second) + ? 
FirstInstrDstRC + : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF); + const TargetRegisterClass *SecondInstrOperandRC = + (Opcode.first == Opcode.second) + ? FirstInstrOperandRC + : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF); + + // Get old registers destinations and new register destinations Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - Register NewTmpReg = MRI->createVirtualRegister(RC); - Register NewDstReg = MRI->createVirtualRegister(RC); - - MRI->constrainRegClass(SrcReg, RC); - MRI->constrainRegClass(NewTmpReg, ORC); - MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg)); - + Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC); + // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to + // reuse that same destination register. + Register NewDstReg = DstReg.isVirtual() + ? MRI->createVirtualRegister(SecondInstrDstRC) + : DstReg; + + // Constrain registers based on their new uses + MRI->constrainRegClass(SrcReg, FirstInstrOperandRC); + MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC); + if (DstReg != NewDstReg) + MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg)); + + // Call the delegating operation to build the instruction BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg); - MRI->replaceRegWith(DstReg, NewDstReg); // replaceRegWith changes MI's definition register. Keep it for SSA form until - // deleting MI. - MI.getOperand(0).setReg(DstReg); + // deleting MI. Only if we made a new destination register. + if (DstReg != NewDstReg) { + MRI->replaceRegWith(DstReg, NewDstReg); + MI.getOperand(0).setReg(DstReg); + } // Record the MIs need to be removed. 
- ToBeRemoved.insert(&MI); + MI.eraseFromParent(); if (SubregToRegMI) - ToBeRemoved.insert(SubregToRegMI); - ToBeRemoved.insert(MovMI); + SubregToRegMI->eraseFromParent(); + MovMI->eraseFromParent(); return true; } @@ -407,45 +487,57 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { assert(MRI->isSSA() && "Expected to be run on SSA form!"); bool Changed = false; - SmallSetVector<MachineInstr *, 8> ToBeRemoved; for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { + for (MachineInstr &MI : make_early_inc_range(MBB)) { switch (MI.getOpcode()) { default: break; case AArch64::ANDWrr: - Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved); + Changed = visitAND<uint32_t>(AArch64::ANDWri, MI); break; case AArch64::ANDXrr: - Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved); + Changed = visitAND<uint64_t>(AArch64::ANDXri, MI); break; case AArch64::ORRWrs: - Changed = visitORR(MI, ToBeRemoved); + Changed = visitORR(MI); break; case AArch64::ADDWrr: - Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI, - ToBeRemoved); + Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI); break; case AArch64::SUBWrr: - Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI, - ToBeRemoved); + Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI); break; case AArch64::ADDXrr: - Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI, - ToBeRemoved); + Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI); break; case AArch64::SUBXrr: - Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI, - ToBeRemoved); + Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI); + break; + case AArch64::ADDSWrr: + Changed = visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri}, + {AArch64::SUBWri, AArch64::SUBSWri}, + MI); + break; + case AArch64::SUBSWrr: + Changed = visitADDSSUBS<uint32_t>({AArch64::SUBWri, 
AArch64::SUBSWri}, + {AArch64::ADDWri, AArch64::ADDSWri}, + MI); + break; + case AArch64::ADDSXrr: + Changed = visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri}, + {AArch64::SUBXri, AArch64::SUBSXri}, + MI); + break; + case AArch64::SUBSXrr: + Changed = visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri}, + {AArch64::ADDXri, AArch64::ADDSXri}, + MI); break; } } } - for (MachineInstr *MI : ToBeRemoved) - MI->eraseFromParent(); - return Changed; } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index 6950675c5d53..a2ab2b855d80 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -15,8 +15,11 @@ #include "AArch64MachineFunctionInfo.h" #include "AArch64InstrInfo.h" -#include <llvm/IR/Metadata.h> -#include <llvm/IR/Module.h> +#include "AArch64Subtarget.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/MCAsmInfo.h" using namespace llvm; @@ -30,7 +33,7 @@ void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) { void AArch64FunctionInfo::initializeBaseYamlFields( const yaml::AArch64FunctionInfo &YamlMFI) { - if (YamlMFI.HasRedZone.hasValue()) + if (YamlMFI.HasRedZone) HasRedZone = YamlMFI.HasRedZone; } @@ -77,15 +80,17 @@ static bool ShouldSignWithBKey(const Function &F) { return Key.equals_insensitive("b_key"); } -AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) { +AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF_) : MF(&MF_) { // If we already know that the function doesn't have a redzone, set // HasRedZone here. 
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) + if (MF->getFunction().hasFnAttribute(Attribute::NoRedZone)) HasRedZone = false; - const Function &F = MF.getFunction(); + const Function &F = MF->getFunction(); std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F); SignWithBKey = ShouldSignWithBKey(F); + // TODO: skip functions that have no instrumented allocas for optimization + IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag); if (!F.hasFnAttribute("branch-target-enforcement")) { if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( @@ -101,6 +106,15 @@ AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) { BranchTargetEnforcement = BTIEnable.equals_insensitive("true"); } +MachineFunctionInfo *AArch64FunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const { + AArch64FunctionInfo *InfoClone = DestMF.cloneInfo<AArch64FunctionInfo>(*this); + InfoClone->MF = &DestMF; + return InfoClone; +} + bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const { if (!SignReturnAddress) return false; @@ -111,6 +125,27 @@ bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const { bool AArch64FunctionInfo::shouldSignReturnAddress() const { return shouldSignReturnAddress(llvm::any_of( - MF.getFrameInfo().getCalleeSavedInfo(), + MF->getFrameInfo().getCalleeSavedInfo(), [](const auto &Info) { return Info.getReg() == AArch64::LR; })); } + +bool AArch64FunctionInfo::needsDwarfUnwindInfo() const { + if (!NeedsDwarfUnwindInfo) + NeedsDwarfUnwindInfo = MF->needsFrameMoves() && + !MF->getTarget().getMCAsmInfo()->usesWindowsCFI(); + + return *NeedsDwarfUnwindInfo; +} + +bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo() const { + if (!NeedsAsyncDwarfUnwindInfo) { + const Function &F = MF->getFunction(); + // The check got "minsize" is because epilogue unwind info is not emitted + // (yet) for 
homogeneous epilogues, outlined functions, and functions + // outlined from. + NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo() && + F.getUWTableKind() == UWTableKind::Async && + !F.hasMinSize(); + } + return *NeedsAsyncDwarfUnwindInfo; +} diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index e5e08e6c00d6..f070f989a5b7 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MIRYamlMapping.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" @@ -36,7 +37,7 @@ class MachineInstr; /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { /// Backreference to the machine function. - MachineFunction &MF; + MachineFunction *MF; /// Number of bytes of arguments this function has on the stack. If the callee /// is expected to restore the argument stack this should be a multiple of 16, @@ -115,7 +116,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SRetReturnReg - sret lowering includes returning the value of the /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. - unsigned SRetReturnReg = 0; + Register SRetReturnReg; + /// SVE stack size (for predicates and data vectors) are maintained here /// rather than in FrameInfo, as the placement and Stack IDs are target /// specific. @@ -173,9 +175,29 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// The stack slot where the Swift asynchronous context is stored. 
int SwiftAsyncContextFrameIdx = std::numeric_limits<int>::max(); + bool IsMTETagged = false; + + /// The function has Scalable Vector or Scalable Predicate register argument + /// or return type + bool IsSVECC = false; + + /// True if the function need unwind information. + mutable Optional<bool> NeedsDwarfUnwindInfo; + + /// True if the function need asynchronous unwind information. + mutable Optional<bool> NeedsAsyncDwarfUnwindInfo; + public: explicit AArch64FunctionInfo(MachineFunction &MF); + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const override; + + bool isSVECC() const { return IsSVECC; }; + void setIsSVECC(bool s) { IsSVECC = s; }; + void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } @@ -395,6 +417,7 @@ public: bool shouldSignReturnAddress(bool SpillsLR) const; bool shouldSignWithBKey() const { return SignWithBKey; } + bool isMTETagged() const { return IsMTETagged; } bool branchTargetEnforcement() const { return BranchTargetEnforcement; } @@ -408,6 +431,9 @@ public: } int getSwiftAsyncContextFrameIdx() const { return SwiftAsyncContextFrameIdx; } + bool needsDwarfUnwindInfo() const; + bool needsAsyncDwarfUnwindInfo() const; + private: // Hold the lists of LOHs. MILOHContainer LOHContainerSet; diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp new file mode 100644 index 000000000000..6c8845ee8598 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp @@ -0,0 +1,82 @@ +//===- AArch64MachineScheduler.cpp - MI Scheduler for AArch64 -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AArch64MachineScheduler.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" + +using namespace llvm; + +static bool needReorderStoreMI(const MachineInstr *MI) { + if (!MI) + return false; + + switch (MI->getOpcode()) { + default: + return false; + case AArch64::STURQi: + case AArch64::STRQui: + if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend()) + return false; + LLVM_FALLTHROUGH; + case AArch64::STPQi: + return AArch64InstrInfo::getLdStOffsetOp(*MI).isImm(); + } + + return false; +} + +// Return true if two stores with same base address may overlap writes +static bool mayOverlapWrite(const MachineInstr &MI0, const MachineInstr &MI1, + int64_t &Off0, int64_t &Off1) { + const MachineOperand &Base0 = AArch64InstrInfo::getLdStBaseOp(MI0); + const MachineOperand &Base1 = AArch64InstrInfo::getLdStBaseOp(MI1); + + // May overlapping writes if two store instructions without same base + if (!Base0.isIdenticalTo(Base1)) + return true; + + int StoreSize0 = AArch64InstrInfo::getMemScale(MI0); + int StoreSize1 = AArch64InstrInfo::getMemScale(MI1); + Off0 = AArch64InstrInfo::hasUnscaledLdStOffset(MI0.getOpcode()) + ? AArch64InstrInfo::getLdStOffsetOp(MI0).getImm() + : AArch64InstrInfo::getLdStOffsetOp(MI0).getImm() * StoreSize0; + Off1 = AArch64InstrInfo::hasUnscaledLdStOffset(MI1.getOpcode()) + ? AArch64InstrInfo::getLdStOffsetOp(MI1).getImm() + : AArch64InstrInfo::getLdStOffsetOp(MI1).getImm() * StoreSize1; + + const MachineInstr &MI = (Off0 < Off1) ? MI0 : MI1; + int Multiples = AArch64InstrInfo::isPairedLdSt(MI) ? 
2 : 1; + int StoreSize = AArch64InstrInfo::getMemScale(MI) * Multiples; + + return llabs(Off0 - Off1) < StoreSize; +} + +bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand) { + bool OriginalResult = PostGenericScheduler::tryCandidate(Cand, TryCand); + + if (Cand.isValid()) { + MachineInstr *Instr0 = TryCand.SU->getInstr(); + MachineInstr *Instr1 = Cand.SU->getInstr(); + + if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1)) + return OriginalResult; + + int64_t Off0, Off1; + // With the same base address and non-overlapping writes. + if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) { + TryCand.Reason = NodeOrder; + // Order them by ascending offsets. + return Off0 < Off1; + } + } + + return OriginalResult; +} diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.h b/llvm/lib/Target/AArch64/AArch64MachineScheduler.h new file mode 100644 index 000000000000..23df015986d1 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.h @@ -0,0 +1,33 @@ +//===- AArch64MachineScheduler.h - Custom AArch64 MI scheduler --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Custom AArch64 MI scheduler. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINESCHEDULER_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// A MachineSchedStrategy implementation for AArch64 post RA scheduling. 
+class AArch64PostRASchedStrategy : public PostGenericScheduler { +public: + AArch64PostRASchedStrategy(const MachineSchedContext *C) : + PostGenericScheduler(C) {} + +protected: + bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override; +}; + +} // end namespace llvm + +#endif + diff --git a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp index e8217eaf6ed5..c7657f37d16d 100644 --- a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -157,16 +157,19 @@ static bool isCryptoEORPair(const MachineInstr *FirstMI, return false; } -/// Literal generation. -static bool isLiteralsPair(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { +static bool isAdrpAddPair(const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { // Assume the 1st instr to be a wildcard if it is unspecified. - - // PC relative address. if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::ADRP) && SecondMI.getOpcode() == AArch64::ADDXri) return true; + return false; +} +/// Literal generation. +static bool isLiteralsPair(const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + // Assume the 1st instr to be a wildcard if it is unspecified. // 32 bit immediate. 
if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVZWi) && (SecondMI.getOpcode() == AArch64::MOVKWi && @@ -397,6 +400,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, return true; if (ST.hasFuseCryptoEOR() && isCryptoEORPair(FirstMI, SecondMI)) return true; + if (ST.hasFuseAdrpAdd() && isAdrpAddPair(FirstMI, SecondMI)) + return true; if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI)) return true; if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI)) diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index f443cd03935c..4555f1a3ebb0 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -14,6577 +14,6608 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H #define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H +#include "llvm/ADT/ArrayRef.h" + // 31 entries have cost 0 -// 242 entries have cost 1 -// 1447 entries have cost 2 -// 3602 entries have cost 3 -// 1237 entries have cost 4 -// 2 entries have cost 5 +// 756 entries have cost 1 +// 3690 entries have cost 2 +// 2084 entries have cost 3 // This table is 6561*4 = 26244 bytes in size. 
-static const unsigned PerfectShuffleTable[6561+1] = { - 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS - 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS - 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> - 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS - 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> - 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> - 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS - 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> - 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS - 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> - 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> - 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> - 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> - 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> - 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS - 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> - 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> - 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS - 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> - 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> - 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> - 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> - 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS - 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> - 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> - 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> - 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> - 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, 
<3,4,5,6> - 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> - 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> - 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> - 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> - 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> - 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> - 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS - 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> - 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> - 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> - 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> - 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> - 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> - 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> - 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> - 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> - 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> - 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS - 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> - 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> - 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS - 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> - 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> - 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> - 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> - 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> - 3635881401U, 
// <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> - 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> - 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> - 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> - 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> - 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> - 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS - 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS - 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS - 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> - 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS - 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> - 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS - 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> - 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> - 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> - 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> - 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> - 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> - 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> - 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS - 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> - 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> - 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> - 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> - 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS - 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> - 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> - 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> - 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, 
LHS - 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS - 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> - 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> - 835584U, // <0,1,2,3>: Cost 0 copy LHS - 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS - 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> - 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> - 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> - 835584U, // <0,1,2,u>: Cost 0 copy LHS - 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> - 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> - 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> - 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> - 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS - 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> - 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> - 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> - 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> - 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS - 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> - 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> - 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> - 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS - 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> - 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS - 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> - 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> - 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> - 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> - 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> - 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> - 2651099172U, // 
<0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> - 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> - 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> - 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS - 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> - 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> - 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> - 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS - 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> - 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> - 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> - 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> - 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> - 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> - 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> - 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> - 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> - 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> - 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> - 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> - 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> - 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS - 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS - 835584U, // <0,1,u,3>: Cost 0 copy LHS - 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS - 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> - 835584U, // <0,1,u,u>: Cost 0 copy LHS - 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> - 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS - 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, 
<2,0,3,0> - 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> - 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> - 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> - 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> - 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS - 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> - 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> - 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> - 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS - 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> - 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> - 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> - 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> - 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> - 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> - 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> - 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS - 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> - 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> - 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> - 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS - 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> - 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> - 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> - 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> - 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> - 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS - 4095313828U, // <0,2,4,1>: Cost 4 vtrnl 
<0,2,4,6>, <2,6,1,3> - 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> - 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> - 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS - 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS - 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS - 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> - 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> - 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> - 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> - 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> - 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> - 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> - 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS - 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> - 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> - 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> - 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> - 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> - 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> - 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> - 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> - 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> - 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> - 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> - 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> - 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> - 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> - 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> - 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> - 
2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> - 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS - 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS - 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS - 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS - 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> - 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> - 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> - 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> - 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS - 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> - 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> - 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> - 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS - 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> - 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> - 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> - 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> - 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> - 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> - 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> - 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> - 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> - 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS - 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> - 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> - 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS - 3020737026U, // <0,3,2,5>: Cost 3 vtrnl 
LHS, <3,4,5,6> - 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> - 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> - 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS - 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> - 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> - 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> - 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> - 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> - 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> - 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> - 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> - 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> - 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> - 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> - 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> - 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> - 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> - 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> - 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS - 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> - 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> - 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS - 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> - 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> - 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> - 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> - 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> - 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> - 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> - 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> - 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> - 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> - 3765619338U, // <0,3,6,2>: Cost 
4 vext3 <1,2,3,0>, <3,6,2,7> - 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> - 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> - 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> - 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> - 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> - 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> - 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> - 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> - 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> - 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> - 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> - 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> - 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> - 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> - 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> - 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS - 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> - 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> - 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS - 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> - 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> - 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS - 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> - 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> - 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> - 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> - 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> - 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> - 
3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS - 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS - 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> - 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> - 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> - 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS - 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS - 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS - 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS - 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS - 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> - 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> - 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> - 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS - 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS - 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS - 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> - 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> - 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> - 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> - 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> - 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> - 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS - 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> - 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> - 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> - 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> - 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> - 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> - 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> - 2618608950U, // <0,4,4,5>: 
Cost 3 vext2 <0,2,0,4>, RHS - 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS - 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> - 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS - 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS - 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> - 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> - 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> - 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS - 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> - 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> - 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> - 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> - 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> - 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> - 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> - 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> - 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> - 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> - 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> - 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS - 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> - 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> - 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> - 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS - 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> - 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> - 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS - 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2753140526U, 
// <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS - 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> - 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS - 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS - 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS - 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> - 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS - 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> - 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS - 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> - 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> - 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS - 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> - 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> - 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS - 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS - 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS - 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> - 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> - 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> - 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> - 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> - 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> - 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> - 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS - 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS - 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> - 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> - 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> - 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS - 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> - 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> - 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS - 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS - 3699656854U, 
// <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> - 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> - 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> - 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> - 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> - 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> - 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> - 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> - 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> - 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> - 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> - 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> - 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> - 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> - 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS - 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> - 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> - 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS - 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> - 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> - 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> - 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> - 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> - 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> - 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> - 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> - 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> - 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS - 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> - 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> - 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> - 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS - 2592362594U, // <0,5,6,5>: Cost 3 vext1 
<7,0,5,6>, <5,6,7,0> - 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> - 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> - 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> - 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS - 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> - 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> - 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> - 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS - 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> - 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> - 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> - 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS - 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS - 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS - 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> - 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> - 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> - 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS - 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> - 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> - 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> - 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS - 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> - 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> - 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> - 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> - 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> - 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS - 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS - 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS - 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> - 2886521338U, // <0,6,1,2>: 
Cost 3 vzipl LHS, <6,2,7,3> - 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> - 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS - 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> - 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> - 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS - 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS - 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> - 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> - 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> - 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> - 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> - 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> - 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS - 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS - 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> - 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> - 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> - 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> - 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> - 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> - 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> - 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> - 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> - 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS - 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> - 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> - 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> - 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS - 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> - 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS - 2619952681U, // <0,6,4,u>: Cost 3 
vext2 <0,4,0,6>, RHS - 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> - 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> - 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> - 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> - 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> - 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> - 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS - 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> - 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> - 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> - 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> - 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> - 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> - 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> - 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> - 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> - 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> - 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> - 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> - 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> - 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> - 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> - 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> - 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> - 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> - 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS - 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> - 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> - 1500793762U, // <0,6,u,4>: Cost 2 vext1 
<4,0,6,u>, <4,0,6,u> - 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> - 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS - 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS - 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> - 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS - 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> - 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> - 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> - 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> - 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> - 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> - 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS - 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> - 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> - 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> - 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> - 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> - 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> - 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> - 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> - 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> - 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS - 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> - 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> - 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> - 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS - 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> - 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> - 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> - 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> - 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> - 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> - 3330315268U, // <0,7,3,2>: Cost 4 
vrev <7,0,2,3> - 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> - 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> - 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> - 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> - 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> - 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS - 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> - 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> - 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> - 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> - 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS - 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> - 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> - 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS - 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> - 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> - 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> - 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> - 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> - 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> - 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> - 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> - 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> - 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> - 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> - 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> - 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> - 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS - 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> - 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> - 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> - 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, 
<6,u,0,7> - 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> - 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> - 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> - 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> - 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS - 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> - 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> - 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> - 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> - 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> - 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> - 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> - 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> - 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> - 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS - 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> - 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> - 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> - 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS - 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS - 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> - 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS - 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> - 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> - 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS - 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> - 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS - 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS - 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS - 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> - 2966736200U, // <0,u,1,7>: Cost 
3 vzipr <2,3,0,1>, RHS - 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS - 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS - 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> - 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS - 835584U, // <0,u,2,3>: Cost 0 copy LHS - 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS - 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> - 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS - 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> - 835584U, // <0,u,2,u>: Cost 0 copy LHS - 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> - 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> - 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> - 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> - 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS - 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> - 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS - 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS - 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> - 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS - 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS - 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> - 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> - 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> - 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS - 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> - 1618139290U, // 
<0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS - 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS - 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> - 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> - 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> - 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS - 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> - 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> - 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> - 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> - 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS - 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> - 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> - 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> - 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS - 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> - 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> - 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> - 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS - 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS - 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS - 835584U, // <0,u,u,3>: Cost 0 copy LHS - 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS - 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS - 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> - 835584U, // <0,u,u,u>: Cost 0 copy LHS - 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> - 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> - 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> - 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> - 2684657701U, // <1,0,0,4>: Cost 3 
vext3 <0,0,4,1>, <0,0,4,1> - 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> - 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> - 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> - 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> - 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS - 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> - 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> - 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS - 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> - 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> - 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> - 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS - 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> - 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> - 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> - 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> - 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> - 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> - 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> - 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> - 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> - 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> - 67944550U, // <1,0,3,2>: Cost 1 vrev LHS - 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> - 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS - 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> - 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> - 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> - 68386972U, // <1,0,3,u>: Cost 1 vrev LHS - 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> - 2689745234U, // <1,0,4,1>: Cost 3 
vext3 <0,u,1,1>, <0,4,1,5> - 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> - 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> - 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> - 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS - 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> - 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> - 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS - 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> - 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS - 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> - 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> - 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> - 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> - 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS - 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS - 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> - 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> - 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> - 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> - 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> - 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> - 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> - 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> - 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> - 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> - 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> - 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> - 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> - 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> - 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> - 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> - 3731543660U, 
// <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> - 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> - 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> - 67985515U, // <1,0,u,2>: Cost 1 vrev LHS - 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> - 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> - 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS - 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> - 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> - 68427937U, // <1,0,u,u>: Cost 1 vrev LHS - 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> - 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS - 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> - 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> - 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> - 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> - 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> - 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> - 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> - 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS - 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> - 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> - 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> - 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> - 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS - 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> - 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> - 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> - 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> - 2550893878U, // <1,1,2,4>: Cost 3 vext1 
<0,1,1,2>, RHS - 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> - 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> - 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> - 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> - 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> - 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> - 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> - 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS - 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> - 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> - 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> - 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> - 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS - 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS - 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> - 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> - 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> - 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS - 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS - 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> - 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS - 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> - 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> - 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> - 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> - 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> - 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> - 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> - 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> - 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> - 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> - 
2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> - 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> - 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> - 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> - 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> - 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> - 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> - 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> - 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> - 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS - 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> - 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> - 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> - 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> - 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS - 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> - 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS - 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS - 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> - 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS - 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> - 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS - 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> - 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> - 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> - 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> - 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> - 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> - 1561510557U, // 
<1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS - 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> - 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> - 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> - 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS - 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS - 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> - 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> - 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> - 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS - 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> - 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> - 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> - 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> - 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> - 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> - 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> - 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> - 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> - 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS - 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> - 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS - 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS - 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> - 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> - 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> - 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> - 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> - 1561513270U, // <1,2,4,5>: Cost 2 vext2 
<3,0,1,2>, RHS - 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> - 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> - 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS - 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS - 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> - 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> - 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS - 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> - 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> - 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> - 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS - 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> - 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> - 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> - 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> - 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> - 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> - 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> - 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> - 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> - 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> - 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> - 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> - 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> - 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> - 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> - 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> - 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> - 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> - 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS - 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477273192U, // 
<1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> - 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS - 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS - 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS - 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> - 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS - 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> - 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> - 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> - 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> - 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> - 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> - 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS - 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> - 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> - 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> - 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS - 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS - 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> - 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> - 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> - 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS - 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> - 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> - 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> - 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 1483276390U, // <1,3,3,0>: Cost 
2 vext1 <1,1,3,3>, LHS - 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> - 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> - 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> - 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS - 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> - 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> - 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> - 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS - 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS - 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> - 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> - 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> - 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS - 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> - 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> - 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS - 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS - 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> - 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> - 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> - 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS - 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> - 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> - 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS - 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS - 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> - 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> - 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> - 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> - 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> - 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> - 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> - 2819852218U, // <1,3,6,7>: 
Cost 3 vuzpr LHS, <2,6,3,7> - 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> - 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> - 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> - 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> - 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> - 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> - 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> - 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> - 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> - 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS - 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> - 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> - 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS - 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS - 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> - 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS - 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS - 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> - 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS - 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> - 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> - 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> - 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> - 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> - 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> - 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> - 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> - 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> - 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> - 3693085726U, // <1,4,1,4>: Cost 4 vext2 
<0,3,1,4>, <1,4,0,1> - 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS - 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> - 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS - 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> - 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> - 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> - 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> - 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> - 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> - 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> - 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS - 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS - 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> - 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> - 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> - 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> - 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> - 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> - 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> - 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> - 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> - 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> - 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> - 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> - 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> - 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS - 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> - 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> - 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS - 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS - 2557109075U, // <1,4,5,1>: Cost 3 vext1 
<1,1,4,5>, <1,1,4,5> - 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> - 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> - 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS - 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS - 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS - 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> - 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> - 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> - 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS - 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> - 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> - 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> - 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS - 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> - 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> - 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> - 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> - 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS - 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> - 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> - 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> - 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> - 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS - 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS - 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> - 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> - 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> - 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> - 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 2712751498U, // <1,4,u,7>: Cost 
3 vext3 <4,6,7,1>, <4,6,7,1> - 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> - 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> - 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> - 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> - 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> - 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> - 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS - 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> - 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> - 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> - 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> - 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> - 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> - 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> - 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS - 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> - 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> - 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> - 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> - 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> - 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> - 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> - 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> - 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS - 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> - 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> - 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> - 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> - 2620017052U, // <1,5,3,3>: Cost 3 vext2 
<0,4,1,5>, <3,3,3,3> - 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> - 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> - 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> - 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS - 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS - 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> - 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> - 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> - 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> - 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS - 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> - 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS - 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS - 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> - 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> - 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> - 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS - 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> - 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> - 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> - 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS - 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> - 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> - 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> - 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> - 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> - 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> - 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> - 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> - 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> - 2557198438U, // <1,5,7,0>: Cost 3 vext1 
<1,1,5,7>, LHS - 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> - 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> - 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> - 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS - 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> - 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> - 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> - 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS - 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> - 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> - 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> - 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> - 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> - 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS - 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS - 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> - 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS - 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> - 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> - 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> - 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> - 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> - 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS - 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS - 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> - 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> - 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> - 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> - 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> - 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> - 3692438720U, // <1,6,1,6>: Cost 
4 vext2 <0,2,1,6>, <1,6,0,1> - 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS - 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> - 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> - 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> - 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> - 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS - 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> - 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> - 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS - 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS - 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> - 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> - 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> - 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS - 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> - 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> - 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> - 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> - 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> - 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> - 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> - 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> - 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS - 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS - 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS - 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS - 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> - 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> - 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> - 3642943767U, // <1,6,5,3>: Cost 4 vext1 
<3,1,6,5>, <3,1,6,5> - 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> - 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> - 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> - 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS - 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS - 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> - 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> - 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> - 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> - 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS - 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> - 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> - 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> - 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> - 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> - 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> - 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> - 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> - 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> - 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> - 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> - 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS - 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> - 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> - 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS - 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> - 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> - 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> - 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS - 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> - 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> - 
2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> - 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS - 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> - 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> - 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> - 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> - 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> - 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> - 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS - 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS - 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> - 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> - 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> - 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS - 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> - 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> - 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> - 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> - 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS - 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> - 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> - 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> - 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS - 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> - 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> - 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> - 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS - 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS - 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> - 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> - 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> - 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS - 1507462864U, // <1,7,3,5>: Cost 2 vext1 
<5,1,7,3>, <5,1,7,3> - 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> - 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> - 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS - 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> - 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> - 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> - 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> - 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> - 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS - 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> - 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> - 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS - 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS - 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> - 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> - 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> - 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS - 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> - 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> - 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS - 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS - 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> - 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> - 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> - 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> - 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS - 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> - 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> - 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> - 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> - 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> - 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> - 3660949158U, // <1,7,7,2>: Cost 
4 vext1 <6,1,7,7>, <2,3,0,1> - 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> - 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS - 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> - 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> - 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> - 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> - 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS - 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS - 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> - 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> - 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS - 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> - 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> - 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> - 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS - 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> - 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS - 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> - 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> - 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> - 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> - 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> - 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> - 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS - 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS - 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS - 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> - 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS - 2819449750U, // 
<1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS - 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS - 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 115726126U, // <1,u,3,2>: Cost 1 vrev LHS - 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS - 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS - 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> - 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS - 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS - 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> - 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> - 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> - 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> - 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS - 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS - 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> - 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> - 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS - 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS - 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> - 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS - 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS - 1745710391U, // <1,u,5,u>: Cost 2 vuzpr 
LHS, RHS - 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> - 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> - 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> - 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> - 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> - 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> - 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> - 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> - 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> - 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> - 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> - 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS - 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> - 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> - 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> - 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> - 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS - 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS - 115767091U, // <1,u,u,2>: Cost 1 vrev LHS - 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS - 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS - 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS - 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS - 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS - 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> - 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> - 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> - 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> - 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS - 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> - 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, 
<0,6,2,0> - 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> - 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> - 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> - 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> - 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS - 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS - 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> - 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> - 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> - 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS - 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS - 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> - 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> - 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> - 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS - 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> - 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> - 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> - 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS - 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> - 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> - 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> - 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS - 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> - 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> - 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> - 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> - 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS - 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> - 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> - 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> 
- 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS - 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> - 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> - 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS - 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> - 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> - 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> - 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> - 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> - 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> - 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> - 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> - 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS - 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> - 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> - 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> - 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> - 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> - 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> - 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> - 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> - 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> - 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> - 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> - 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> - 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> - 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> - 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS - 2953668262U, // <2,0,u,1>: Cost 
3 vzipr LHS, <2,3,0,1> - 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> - 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS - 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS - 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS - 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS - 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> - 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> - 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS - 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> - 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> - 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> - 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> - 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS - 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> - 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> - 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> - 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS - 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> - 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> - 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> - 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> - 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS - 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> - 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> - 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> - 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS - 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> - 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> - 2726249402U, // <2,1,2,7>: Cost 3 
vext3 <7,0,1,2>, <1,2,7,0> - 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> - 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS - 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> - 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> - 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS - 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> - 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> - 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> - 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> - 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> - 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> - 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS - 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS - 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS - 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> - 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> - 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS - 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> - 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> - 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> - 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS - 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> - 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> - 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS - 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> - 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS - 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> - 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> - 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS - 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS - 4029382994U, // 
<2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> - 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> - 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> - 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS - 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> - 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> - 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> - 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> - 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> - 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> - 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> - 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> - 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> - 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS - 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> - 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> - 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS - 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> - 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> - 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> - 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS - 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> - 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> - 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS - 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> - 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> - 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> - 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> - 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> - 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> - 2622718870U, // <2,2,1,2>: Cost 
3 vext2 <0,u,2,2>, <1,2,3,0> - 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS - 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS - 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> - 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> - 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> - 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS - 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS - 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> - 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS - 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> - 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS - 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> - 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> - 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS - 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> - 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> - 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> - 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS - 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> - 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> - 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> - 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> - 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS - 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS - 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> - 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> - 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> - 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS - 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS - 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> - 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS - 3696463432U, // 
<2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> - 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> - 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> - 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS - 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> - 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> - 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> - 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS - 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> - 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> - 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> - 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> - 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS - 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> - 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> - 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> - 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> - 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> - 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> - 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> - 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS - 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> - 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> - 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> - 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> - 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> - 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS - 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS - 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS - 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS - 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS - 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, 
<0,4,2,6> - 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS - 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS - 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> - 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> - 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> - 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> - 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS - 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS - 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> - 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> - 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> - 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> - 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> - 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> - 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> - 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> - 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> - 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> - 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> - 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> - 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> - 
2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> - 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> - 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> - 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS - 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> - 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> - 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> - 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS - 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS - 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> - 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS - 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS - 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> - 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> - 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> - 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> - 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> - 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> - 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> - 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> - 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> - 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> - 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> - 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> - 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592120678U, // <2,3,7,4>: Cost 2 vext2 
LHS, <7,4,5,6> - 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> - 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> - 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS - 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> - 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> - 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> - 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS - 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> - 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1> - 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS - 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> - 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> - 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> - 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> - 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> - 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> - 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> - 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> - 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> - 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS - 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> - 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> - 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> - 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> - 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS - 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> - 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> - 2630698602U, // 
<2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> - 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> - 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> - 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS - 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS - 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> - 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS - 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> - 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> - 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> - 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> - 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> - 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> - 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> - 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> - 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> - 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> - 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> - 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> - 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> - 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> - 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS - 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS - 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> - 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS - 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> - 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> - 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> - 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> - 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS - 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> - 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959624U, // 
<2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS - 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> - 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> - 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> - 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS - 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> - 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> - 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS - 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> - 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> - 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> - 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> - 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> - 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> - 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> - 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS - 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> - 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> - 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS - 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> - 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS - 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> - 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> - 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> - 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> - 2620752300U, 
// <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> - 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> - 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS - 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS - 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> - 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> - 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> - 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> - 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> - 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> - 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> - 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> - 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> - 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> - 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> - 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> - 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> - 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> - 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> - 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> - 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS - 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS - 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> - 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> - 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> - 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> - 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> - 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> - 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS - 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, 
<1,2,5,4> - 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> - 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> - 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS - 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS - 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> - 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS - 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS - 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> - 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> - 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> - 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS - 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> - 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> - 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> - 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> - 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS - 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> - 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> - 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> - 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> - 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> - 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> - 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS - 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS - 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS - 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> - 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> - 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> - 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS - 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> - 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> - 4174449974U, // <2,5,7,7>: Cost 4 
vtrnr <2,2,5,7>, RHS - 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS - 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS - 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> - 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> - 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> - 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS - 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS - 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> - 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS - 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> - 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> - 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> - 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> - 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> - 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> - 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS - 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> - 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> - 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> - 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS - 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> - 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> - 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> - 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> - 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS - 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> - 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> - 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> - 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> - 2632042254U, // <2,6,2,4>: 
Cost 3 vext2 <2,4,2,6>, <2,4,2,6> - 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> - 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> - 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS - 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> - 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> - 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> - 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> - 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> - 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> - 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> - 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> - 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS - 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS - 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> - 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> - 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> - 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> - 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> - 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS - 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> - 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS - 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS - 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> - 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> - 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> - 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> - 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> - 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> - 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> - 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS - 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS - 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS - 3893988683U, // <2,6,6,1>: Cost 
4 vuzpr <0,2,4,6>, <4,6,0,1> - 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> - 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> - 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS - 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> - 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> - 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS - 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS - 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> - 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> - 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> - 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> - 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> - 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> - 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> - 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> - 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> - 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2> - 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS - 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3> - 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS - 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> - 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS - 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7> - 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS - 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS - 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> - 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> - 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> - 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> - 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> - 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> - 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> - 2599760953U, // 
<2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2> - 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> - 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> - 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> - 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> - 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> - 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> - 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> - 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> - 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> - 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> - 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> - 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> - 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> - 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> - 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> - 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> - 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> - 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> - 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> - 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS - 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> - 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> - 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> - 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS - 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> - 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> - 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> - 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS - 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> - 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> - 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> - 2257982640U, // <2,7,4,3>: 
Cost 3 vrev <7,2,3,4> - 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> - 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS - 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> - 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> - 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS - 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> - 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> - 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> - 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> - 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS - 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> - 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> - 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6> - 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS - 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS - 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> - 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> - 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> - 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS - 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> - 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> - 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> - 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS - 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> - 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> - 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> - 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> - 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS - 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> - 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> - 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> - 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> - 
1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS - 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> - 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> - 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> - 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS - 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS - 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> - 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> - 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS - 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS - 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2> - 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> - 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2> - 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS - 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS - 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> - 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3> - 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> - 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS - 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS - 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS - 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 3088354857U, // <2,u,2,7>: Cost 3 vtrnr 
<0,2,0,2>, RHS - 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS - 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> - 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> - 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS - 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> - 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> - 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS - 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS - 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS - 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> - 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> - 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> - 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS - 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS - 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS - 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> - 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> - 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> - 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS - 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> - 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> - 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS - 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 1592161080U, // <2,u,6,6>: Cost 2 vext2 
LHS, <6,6,6,6> - 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS - 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> - 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> - 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> - 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS - 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS - 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS - 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS - 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS - 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS - 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS - 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS - 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> - 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> - 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> - 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> - 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> - 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> - 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS - 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> - 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS - 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> - 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS - 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> 
- 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> - 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> - 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> - 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> - 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> - 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> - 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> - 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> - 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> - 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> - 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> - 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> - 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> - 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> - 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> - 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> - 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> - 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> - 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> - 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> - 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> - 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS - 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, 
<3,4,5,6> - 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> - 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> - 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> - 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> - 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS - 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> - 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> - 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> - 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> - 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> - 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> - 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> - 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> - 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> - 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> - 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> - 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> - 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> - 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> - 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> - 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> - 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> - 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> - 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> - 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS - 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> - 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> - 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS - 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS - 2618802278U, // 
<3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS - 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> - 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS - 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> - 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> - 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> - 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> - 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> - 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> - 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> - 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> - 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> - 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> - 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> - 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> - 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> - 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS - 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> - 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> - 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> - 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS - 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> - 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> - 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS - 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> - 1611891735U, // <3,1,3,u>: 
Cost 2 vext3 LHS, <1,3,u,3> - 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS - 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> - 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> - 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> - 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS - 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS - 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS - 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> - 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> - 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> - 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> - 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> - 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> - 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> - 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> - 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> - 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> - 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> - 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> - 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> - 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> - 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS - 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> - 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> - 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS - 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> - 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> - 
3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> - 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS - 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS - 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> - 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> - 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS - 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> - 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> - 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS - 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> - 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> - 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> - 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> - 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS - 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> - 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> - 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> - 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS - 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> - 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> - 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> - 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> - 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> - 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> - 2689836680U, // 
<3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> - 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> - 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> - 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> - 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> - 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> - 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> - 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> - 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> - 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> - 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> - 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> - 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS - 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> - 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> - 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS - 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> - 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> - 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS - 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS - 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> - 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> - 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> - 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> - 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> - 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> - 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> - 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> - 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 2689836993U, // <3,2,6,4>: 
Cost 3 vext3 LHS, <2,6,4,5> - 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> - 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> - 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> - 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> - 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS - 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> - 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> - 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> - 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> - 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS - 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> - 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS - 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> - 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> - 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> - 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> - 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> - 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> - 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> - 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> - 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> - 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> - 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> - 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> - 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> - 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> - 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> - 2685192433U, // <3,3,1,2>: Cost 3 vext3 
LHS, <3,1,2,3> - 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> - 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS - 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> - 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> - 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> - 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> - 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS - 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> - 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> - 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> - 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS - 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> - 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> - 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> - 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> - 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS - 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> - 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> - 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS - 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> - 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> - 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> - 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS - 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS - 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> - 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> - 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> - 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> - 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> - 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS - 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> - 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> - 2558361702U, // <3,3,5,0>: Cost 3 
vext1 <1,3,3,5>, LHS - 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> - 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> - 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> - 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS - 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> - 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> - 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS - 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> - 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> - 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> - 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> - 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> - 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> - 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> - 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> - 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> - 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS - 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> - 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> - 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> - 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS - 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> - 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> - 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> - 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS - 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS - 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> - 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3> - 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS - 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> - 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> - 2826636841U, // <3,3,u,7>: Cost 3 
vuzpr <1,3,1,3>, RHS - 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS - 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> - 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS - 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> - 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> - 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> - 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> - 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> - 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> - 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS - 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> - 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> - 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> - 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> - 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS - 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> - 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> - 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> - 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS - 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> - 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> - 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> - 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> - 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> - 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> - 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> - 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> - 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> - 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> - 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> - 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> - 2624801232U, // <3,4,3,4>: Cost 3 
vext2 <1,2,3,4>, <3,4,0,1> - 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS - 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS - 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> - 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> - 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS - 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> - 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> - 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS - 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> - 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> - 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS - 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS - 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> - 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> - 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS - 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> - 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS - 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> - 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS - 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> - 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> - 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3> - 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> - 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> - 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> - 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> - 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> - 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> - 2660635824U, 
// <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> - 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> - 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> - 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> - 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> - 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> - 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> - 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS - 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS - 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> - 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1> - 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS - 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS - 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS - 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> - 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS - 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> - 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS - 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> - 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> - 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> - 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> - 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> - 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS - 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS - 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> - 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> - 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> - 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> - 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> - 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> - 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> - 1641754329U, // <3,5,1,u>: Cost 2 vext3 
<5,1,u,3>, <5,1,u,3> - 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> - 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> - 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> - 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> - 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> - 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> - 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> - 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> - 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> - 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> - 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> - 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> - 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> - 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> - 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> - 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> - 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS - 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS - 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS - 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> - 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> - 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> - 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS - 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS - 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> - 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> - 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS - 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> - 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> - 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> - 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS - 1659228164U, // <3,5,5,5>: 
Cost 2 vext3 LHS, <5,5,5,5> - 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> - 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> - 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> - 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> - 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> - 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> - 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> - 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> - 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> - 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> - 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> - 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> - 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS - 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> - 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> - 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> - 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS - 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> - 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> - 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS - 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS - 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> - 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> - 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> - 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS - 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> - 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> - 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS - 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS - 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> - 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> - 3706519808U, // <3,6,0,3>: Cost 4 
vext2 <2,5,3,6>, <0,3,1,4> - 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> - 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> - 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> - 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> - 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> - 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> - 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> - 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> - 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> - 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> - 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> - 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> - 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> - 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS - 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> - 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> - 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> - 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> - 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> - 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> - 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> - 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> - 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> - 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> - 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> - 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> - 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> - 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> - 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> - 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS - 3763581536U, // <3,6,4,0>: Cost 
4 vext3 LHS, <6,4,0,6> - 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> - 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> - 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> - 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> - 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> - 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> - 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS - 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> - 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS - 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> - 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> - 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> - 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> - 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> - 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> - 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS - 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> - 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> - 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> - 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> - 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> - 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> - 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> - 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> - 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> - 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> - 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> - 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> - 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> - 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> - 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> - 
2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS - 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> - 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> - 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> - 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> - 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> - 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> - 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> - 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> - 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> - 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> - 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> - 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> - 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> - 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> - 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> - 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> - 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> - 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> - 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> - 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> - 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS - 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> - 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> - 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> - 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> - 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS - 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> - 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> - 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> 
- 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS - 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> - 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> - 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> - 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> - 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> - 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> - 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> - 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> - 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> - 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> - 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> - 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> - 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> - 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS - 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> - 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> - 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> - 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS - 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS - 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> - 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> - 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS - 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> - 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> - 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> - 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> - 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> - 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> - 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> - 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS - 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS - 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, 
<7,6,0,1> - 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> - 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> - 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> - 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> - 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> - 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> - 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> - 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> - 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS - 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> - 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> - 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> - 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS - 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> - 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> - 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> - 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> - 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> - 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS - 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> - 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS - 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> - 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS - 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> - 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> - 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> - 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> - 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> - 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> - 2960313672U, // <3,u,0,7>: Cost 3 
vzipr <1,2,3,0>, RHS - 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> - 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> - 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> - 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS - 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> - 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS - 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> - 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> - 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> - 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> - 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> - 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> - 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> - 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> - 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3> - 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> - 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> - 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> - 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> - 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS - 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> - 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> - 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> - 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS - 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS - 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> - 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> - 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> - 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> - 1663875144U, // <3,u,4,6>: 
Cost 2 vext3 LHS, <u,4,6,6> - 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> - 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> - 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS - 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> - 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> - 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> - 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS - 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS - 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> - 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS - 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> - 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> - 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> - 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> - 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> - 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> - 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> - 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS - 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> - 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> - 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS - 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> - 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS - 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> - 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> - 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS - 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS - 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> - 1611897219U, // <3,u,u,5>: Cost 2 
vext3 LHS, <u,u,5,6> - 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS - 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> - 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS - 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> - 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> - 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> - 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> - 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> - 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> - 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> - 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> - 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> - 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS - 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> - 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> - 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS - 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> - 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> - 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> - 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> - 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> - 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> - 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> - 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> - 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> - 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> - 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> - 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS - 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2686525705U, // <4,0,3,2>: Cost 3 
vext3 <0,3,2,4>, <0,3,2,4> - 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> - 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS - 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> - 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> - 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> - 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS - 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> - 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> - 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> - 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS - 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS - 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> - 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> - 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS - 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS - 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS - 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> - 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> - 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS - 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> - 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> - 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> - 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS - 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS - 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> - 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS - 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> - 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS - 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> - 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> - 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> - 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS - 3731764218U, // <4,0,7,0>: 
Cost 4 vext2 <6,7,4,0>, <7,0,1,2> - 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS - 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS - 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> - 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> - 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> - 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> - 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> - 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS - 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS - 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS - 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> - 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS - 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS - 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> - 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> - 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS - 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> - 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> - 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> - 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> - 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> - 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> - 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS - 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> - 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> - 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> - 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> - 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> - 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> - 3696600307U, // 
<4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> - 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> - 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> - 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS - 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> - 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> - 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> - 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS - 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> - 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> - 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> - 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> - 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS - 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> - 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> - 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> - 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS - 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> - 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> - 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> - 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> - 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> - 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> - 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS - 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> - 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS - 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> - 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> - 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS - 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> - 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, 
<2,1,3,5> - 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> - 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> - 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> - 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> - 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS - 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS - 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> - 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> - 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS - 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS - 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> - 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> - 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> - 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS - 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> - 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> - 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> - 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> - 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> - 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> - 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> - 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS - 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> - 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> - 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> - 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS - 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS - 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> - 1479194414U, // <4,1,u,u>: Cost 2 vext1 
<0,4,1,u>, LHS - 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> - 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS - 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> - 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> - 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> - 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> - 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> - 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> - 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS - 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> - 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> - 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> - 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS - 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> - 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> - 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> - 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> - 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> - 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> - 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> - 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> - 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> - 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> - 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> - 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> - 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> - 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> - 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> - 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> - 2685863630U, // <4,2,3,4>: Cost 3 vext3 
<0,2,2,4>, <2,3,4,5> - 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> - 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> - 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> - 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> - 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS - 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> - 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> - 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> - 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS - 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> - 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> - 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS - 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS - 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> - 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> - 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS - 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS - 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> - 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> - 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS - 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS - 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS - 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> - 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> - 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS - 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS - 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> - 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> - 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> - 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS - 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> - 
3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> - 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> - 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> - 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> - 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> - 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> - 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS - 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> - 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> - 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS - 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS - 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> - 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2> - 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS - 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> - 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> - 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> - 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> - 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> - 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> - 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> - 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> - 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> - 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> - 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> - 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> - 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> - 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> - 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> - 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> - 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> - 2703558952U, // 
<4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> - 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS - 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> - 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> - 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> - 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS - 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> - 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> - 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> - 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> - 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> - 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> - 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> - 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> - 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> - 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> - 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> - 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> - 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> - 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> - 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> - 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> - 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> - 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> - 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> - 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> - 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS - 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> - 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> - 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> - 2559028534U, // <4,3,5,4>: 
Cost 3 vext1 <1,4,3,5>, RHS - 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> - 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> - 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> - 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS - 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS - 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> - 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> - 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> - 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS - 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> - 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> - 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> - 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> - 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> - 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> - 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> - 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> - 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> - 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> - 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> - 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> - 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> - 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS - 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> - 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> - 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> - 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS - 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> - 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> - 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> - 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> - 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> - 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS 
- 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> - 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> - 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> - 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> - 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> - 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS - 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> - 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> - 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> - 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> - 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> - 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> - 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> - 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> - 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> - 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> - 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> - 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> - 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> - 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> - 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> - 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> - 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> - 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> - 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> - 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> - 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> - 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> - 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> - 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, 
<4,3,6,4> - 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> - 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS - 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> - 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> - 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> - 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS - 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS - 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS - 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS - 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS - 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> - 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> - 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> - 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS - 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS - 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> - 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS - 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> - 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> - 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> - 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS - 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> - 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS - 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> - 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS - 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> - 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> - 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> - 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> - 2662036827U, // <4,4,7,4>: Cost 3 vext2 
<7,4,4,4>, <7,4,4,4> - 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> - 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> - 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> - 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS - 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> - 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS - 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS - 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> - 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS - 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> - 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> - 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> - 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> - 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> - 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> - 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> - 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS - 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> - 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> - 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> - 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS - 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> - 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> - 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> - 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> - 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS - 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS - 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> - 
2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> - 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> - 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> - 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> - 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> - 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS - 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> - 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> - 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> - 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> - 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> - 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> - 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> - 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> - 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> - 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> - 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS - 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> - 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> - 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> - 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS - 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> - 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> - 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS - 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS - 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> - 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> - 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> - 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> - 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> - 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> - 2821344566U, // <4,5,5,7>: Cost 
3 vuzpr <0,4,1,5>, RHS - 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS - 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS - 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> - 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> - 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> - 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS - 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> - 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> - 27705344U, // <4,5,6,7>: Cost 0 copy RHS - 27705344U, // <4,5,6,u>: Cost 0 copy RHS - 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS - 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> - 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> - 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> - 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS - 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> - 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> - 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> - 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS - 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS - 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> - 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> - 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS - 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> - 27705344U, // <4,5,u,7>: Cost 0 copy RHS - 27705344U, // <4,5,u,u>: Cost 0 copy RHS - 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> - 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> - 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> - 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> - 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, 
<0,2,5,7> - 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> - 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS - 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS - 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> - 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> - 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> - 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> - 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> - 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> - 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> - 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS - 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> - 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> - 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> - 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> - 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> - 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> - 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> - 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> - 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> - 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> - 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> - 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> - 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> - 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> - 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> - 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> - 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> - 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> - 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> - 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS - 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> - 
2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> - 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> - 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS - 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS - 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> - 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS - 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS - 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> - 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> - 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> - 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> - 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> - 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> - 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS - 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS - 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS - 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> - 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> - 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> - 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> - 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> - 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> - 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS - 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS - 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> - 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> - 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> - 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> - 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> - 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> - 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2779173945U, // <4,6,7,u>: Cost 3 vuzpl 
RHS, <7,0,u,2> - 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS - 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS - 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> - 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> - 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS - 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS - 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS - 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> - 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> - 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> - 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> - 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> - 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> - 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> - 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS - 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> - 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> - 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> - 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> - 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS - 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> - 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> - 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> - 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> - 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS - 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> - 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> - 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> - 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> - 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> - 3706603450U, // 
<4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> - 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> - 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> - 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> - 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> - 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> - 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> - 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> - 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> - 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> - 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> - 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> - 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> - 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> - 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> - 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> - 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> - 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS - 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> - 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> - 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS - 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> - 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> - 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> - 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> - 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> - 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> - 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> - 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> - 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> - 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS - 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> - 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> - 
2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> - 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS - 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> - 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> - 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS - 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS - 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> - 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> - 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> - 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> - 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> - 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> - 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> - 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> - 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS - 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> - 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> - 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS - 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> - 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> - 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS - 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> - 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> - 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> - 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> - 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> - 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> - 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> - 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS 
- 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> - 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> - 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> - 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> - 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> - 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> - 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> - 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS - 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> - 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> - 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> - 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> - 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> - 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> - 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> - 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> - 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> - 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> - 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> - 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> - 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> - 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> - 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> - 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> - 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS - 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> - 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> - 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> - 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS - 1545194806U, // <4,u,4,5>: 
Cost 2 vext2 <0,2,4,u>, RHS - 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS - 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> - 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS - 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS - 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> - 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS - 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS - 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS - 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS - 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> - 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS - 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> - 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS - 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> - 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS - 27705344U, // <4,u,6,7>: Cost 0 copy RHS - 27705344U, // <4,u,6,u>: Cost 0 copy RHS - 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS - 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> - 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> - 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> - 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS - 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> - 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS - 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS - 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> - 161926454U, // <4,u,u,4>: Cost 1 vdup0 
RHS - 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 27705344U, // <4,u,u,7>: Cost 0 copy RHS - 27705344U, // <4,u,u,u>: Cost 0 copy RHS - 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> - 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> - 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> - 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> - 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> - 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> - 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> - 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> - 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> - 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS - 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> - 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> - 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS - 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> - 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> - 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> - 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> - 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> - 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> - 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> - 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> - 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> - 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> - 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> - 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> - 
2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> - 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> - 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> - 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> - 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> - 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> - 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> - 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS - 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> - 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> - 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS - 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS - 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> - 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> - 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS - 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS - 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> - 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> - 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> - 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> - 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS - 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS - 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> - 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> - 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> - 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> - 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> - 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> - 2662077302U, // <5,0,6,7>: Cost 
3 vext2 <7,4,5,0>, <6,7,4,5> - 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS - 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS - 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> - 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> - 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> - 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS - 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> - 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> - 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> - 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS - 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> - 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> - 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> - 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> - 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS - 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> - 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> - 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> - 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> - 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> - 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> - 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> - 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> - 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS - 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> - 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> - 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> - 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> - 2691326803U, // <5,1,1,4>: Cost 3 
vext3 <1,1,4,5>, <1,1,4,5> - 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> - 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> - 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> - 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> - 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> - 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> - 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> - 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> - 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> - 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> - 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> - 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> - 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> - 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS - 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> - 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> - 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> - 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> - 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> - 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> - 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> - 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> - 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> - 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> - 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> - 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> - 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> - 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> - 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> - 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> - 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, 
<1,5,0,1> - 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> - 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> - 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> - 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> - 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> - 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> - 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> - 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> - 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS - 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> - 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> - 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> - 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS - 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> - 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> - 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> - 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> - 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS - 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> - 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> - 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS - 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS - 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> - 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> - 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> - 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS - 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> - 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> - 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS - 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> - 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646825168U, // <5,1,u,6>: Cost 3 vext2 
<4,u,5,1>, <u,6,3,7> - 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> - 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS - 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> - 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> - 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> - 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> - 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> - 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> - 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> - 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS - 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> - 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> - 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> - 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> - 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS - 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> - 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> - 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> - 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> - 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS - 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> - 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> - 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> - 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> - 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> - 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> - 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> - 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> - 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> - 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> - 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> - 3759687365U, // 
<5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> - 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> - 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> - 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> - 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> - 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> - 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> - 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> - 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> - 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> - 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> - 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> - 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> - 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS - 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS - 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> - 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> - 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS - 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS - 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> - 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> - 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> - 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS - 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS - 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> - 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> - 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> - 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> - 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> - 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> - 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> - 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, 
<2,6,u,7> - 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS - 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> - 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> - 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS - 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS - 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> - 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> - 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> - 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS - 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> - 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> - 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> - 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> - 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> - 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> - 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> - 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> - 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> - 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> - 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> - 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> - 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> - 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> - 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> - 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> - 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> - 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> - 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> - 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> - 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> - 2636219536U, // <5,3,1,5>: Cost 3 vext2 
<3,1,5,3>, <1,5,3,7> - 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> - 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> - 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> - 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> - 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> - 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> - 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> - 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> - 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> - 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> - 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> - 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> - 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> - 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> - 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> - 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> - 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> - 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> - 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> - 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> - 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> - 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> - 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> - 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> - 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> - 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> - 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> - 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> - 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> - 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> - 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS - 2559689870U, // <5,3,5,1>: Cost 3 vext1 
<1,5,3,5>, <1,5,3,5> - 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> - 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> - 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS - 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> - 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> - 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> - 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS - 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS - 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> - 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> - 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> - 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS - 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> - 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> - 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> - 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS - 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS - 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> - 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> - 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> - 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS - 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> - 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> - 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> - 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS - 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS - 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> - 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> - 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> - 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS - 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> - 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> - 2236410471U, // <5,3,u,7>: Cost 3 vrev 
<3,5,7,u> - 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS - 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS - 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS - 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> - 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> - 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> - 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> - 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> - 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> - 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS - 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> - 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> - 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> - 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> - 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> - 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> - 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> - 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> - 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> - 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> - 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> - 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> - 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> - 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> - 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> - 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> - 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> - 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> - 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS - 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> - 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> - 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> - 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> 
- 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> - 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> - 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> - 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> - 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS - 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> - 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> - 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> - 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> - 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> - 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> - 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> - 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> - 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS - 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> - 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> - 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> - 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS - 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> - 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS - 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS - 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS - 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> - 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> - 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> - 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS - 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> - 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> - 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> - 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS - 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS - 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, 
<1,2,3,4> - 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> - 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> - 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS - 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> - 94817590U, // <5,4,7,6>: Cost 1 vrev RHS - 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> - 94965064U, // <5,4,7,u>: Cost 1 vrev RHS - 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS - 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> - 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> - 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> - 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS - 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> - 94825783U, // <5,4,u,6>: Cost 1 vrev RHS - 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> - 94973257U, // <5,4,u,u>: Cost 1 vrev RHS - 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> - 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> - 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> - 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> - 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> - 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> - 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS - 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS - 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> - 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> - 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> - 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> - 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> - 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> - 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> - 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> - 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, 
<1,u,5,5> - 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS - 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> - 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> - 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> - 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> - 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> - 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> - 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> - 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> - 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> - 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> - 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> - 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> - 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> - 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> - 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> - 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> - 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> - 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> - 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> - 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> - 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> - 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS - 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> - 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> - 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> - 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> - 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> - 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> - 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, 
RHS - 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS - 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> - 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> - 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS - 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS - 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> - 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> - 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> - 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> - 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> - 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> - 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> - 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> - 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS - 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> - 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> - 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> - 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS - 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> - 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> - 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS - 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS - 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS - 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> - 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> - 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS - 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> - 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS - 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS - 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> - 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS - 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> - 
2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> - 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> - 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> - 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> - 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS - 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS - 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> - 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> - 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> - 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> - 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> - 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> - 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> - 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> - 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> - 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> - 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> - 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> - 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> - 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> - 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> - 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> - 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> - 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> - 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> - 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> - 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> - 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> - 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> - 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> - 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS - 
1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> - 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS - 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> - 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> - 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> - 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> - 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS - 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> - 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> - 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS - 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS - 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> - 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> - 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> - 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> - 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> - 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> - 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS - 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS - 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS - 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> - 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> - 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> - 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS - 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> - 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> - 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS - 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS - 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> - 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS - 1504104390U, // 
<5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> - 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> - 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> - 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS - 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS - 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS - 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS - 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS - 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> - 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> - 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS - 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> - 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> - 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> - 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> - 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> - 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> - 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> - 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> - 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> - 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> - 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> - 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS - 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> - 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> - 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> - 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> - 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> - 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> - 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> - 
2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> - 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> - 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> - 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> - 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> - 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> - 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> - 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> - 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> - 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> - 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> - 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> - 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> - 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> - 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> - 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS - 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> - 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> - 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> - 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS - 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> - 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS - 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS - 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> - 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> - 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> - 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS - 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> - 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> - 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS - 1772539191U, // <5,7,5,u>: Cost 2 vuzpr 
RHS, RHS - 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> - 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> - 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> - 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> - 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> - 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> - 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> - 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> - 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS - 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> - 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> - 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> - 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS - 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> - 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> - 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> - 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS - 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS - 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> - 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS - 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS - 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> - 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS - 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> - 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS - 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> - 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> - 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> - 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> - 2266134675U, // <5,u,0,6>: Cost 3 
vrev <u,5,6,0> - 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> - 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS - 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> - 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> - 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> - 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> - 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> - 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> - 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> - 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> - 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> - 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> - 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> - 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> - 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> - 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> - 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> - 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> - 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> - 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> - 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> - 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> - 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> - 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> - 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> - 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> - 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> - 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, 
<u,4,2,6> - 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> - 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS - 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> - 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS - 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> - 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> - 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> - 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS - 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS - 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS - 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS - 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> - 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> - 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS - 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS - 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> - 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS - 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> - 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS - 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS - 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 118708378U, // <5,u,7,6>: Cost 1 vrev RHS - 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS - 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS - 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS - 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS - 1613388133U, // 
<5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS - 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS - 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS - 118716571U, // <5,u,u,6>: Cost 1 vrev RHS - 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS - 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS - 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> - 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> - 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> - 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> - 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> - 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> - 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> - 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> - 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> - 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS - 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> - 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> - 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS - 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> - 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> - 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> - 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> - 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> - 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> - 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> - 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> - 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> - 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> - 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> - 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, 
<0,2,u,6> - 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> - 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> - 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> - 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> - 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> - 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> - 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> - 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> - 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> - 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> - 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> - 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> - 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> - 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> - 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> - 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> - 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> - 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS - 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> - 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> - 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> - 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> - 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> - 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> - 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> - 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS - 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS - 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> - 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> - 
3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> - 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> - 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> - 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS - 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS - 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> - 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> - 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> - 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS - 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> - 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> - 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> - 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS - 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> - 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> - 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> - 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> - 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> - 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS - 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS - 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> - 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> - 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS - 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> - 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> - 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> - 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> - 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> - 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> - 
3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> - 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> - 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> - 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> - 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> - 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> - 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> - 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS - 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> - 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> - 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> - 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS - 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> - 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> - 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> - 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> - 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS - 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> - 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> - 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> - 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> - 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> - 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> - 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> - 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> - 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> - 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> - 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> - 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> - 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS - 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> - 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> - 
2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> - 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> - 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> - 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> - 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> - 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> - 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> - 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> - 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> - 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS - 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> - 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS - 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> - 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> - 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS - 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS - 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> - 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> - 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> - 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS - 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS - 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> - 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> - 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS - 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS - 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> - 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> - 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> - 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS - 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS - 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> - 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> - 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> 
- 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> - 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> - 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> - 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> - 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> - 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> - 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> - 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> - 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> - 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> - 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> - 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> - 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS - 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> - 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> - 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> - 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS - 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> - 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> - 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> - 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> - 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> - 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> - 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> - 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> - 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> - 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> - 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> - 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> - 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> - 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> - 2686027430U, // 
<6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> - 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> - 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> - 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> - 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> - 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> - 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> - 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> - 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> - 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> - 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> - 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> - 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> - 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> - 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> - 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> - 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> - 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> - 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> - 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> - 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> - 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> - 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> - 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> - 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> - 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> - 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> - 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> - 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> - 2686027724U, // 
<6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> - 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> - 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> - 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> - 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS - 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> - 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> - 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS - 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS - 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> - 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> - 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS - 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> - 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> - 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS - 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> - 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> - 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS - 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> - 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> - 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> - 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> - 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> - 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> - 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> - 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> - 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> - 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> - 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> - 3759769841U, // <6,3,1,2>: Cost 4 vext3 
<0,2,4,6>, <3,1,2,3> - 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> - 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> - 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> - 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> - 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> - 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> - 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> - 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> - 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> - 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> - 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> - 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> - 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> - 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> - 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> - 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> - 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> - 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> - 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> - 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> - 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS - 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> - 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> - 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> - 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS - 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> - 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> - 2237041332U, // <6,3,4,7>: Cost 3 
vrev <3,6,7,4> - 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> - 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS - 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> - 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> - 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> - 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> - 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> - 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> - 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> - 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> - 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS - 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> - 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> - 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> - 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> - 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> - 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> - 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> - 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS - 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS - 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> - 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> - 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> - 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS - 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> - 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> - 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS - 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS - 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> - 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> - 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> - 1492610358U, 
// <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS - 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> - 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> - 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS - 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> - 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> - 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> - 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> - 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> - 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> - 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> - 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS - 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> - 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> - 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> - 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS - 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS - 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> - 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> - 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> - 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> - 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> - 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> - 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> - 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> - 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> - 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS - 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> - 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> - 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> - 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> - 3316305238U, // <6,4,3,1>: 
Cost 4 vrev <4,6,1,3> - 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> - 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> - 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> - 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> - 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> - 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> - 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> - 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS - 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> - 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> - 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> - 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> - 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> - 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS - 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> - 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> - 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> - 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS - 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> - 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> - 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> - 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> - 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS - 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> - 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> - 2578355194U, 
// <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> - 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS - 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS - 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> - 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> - 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> - 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS - 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> - 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> - 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> - 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS - 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> - 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> - 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> - 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS - 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> - 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS - 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> - 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> - 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> - 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> - 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> - 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> - 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> - 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> - 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> - 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> - 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS - 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, 
<1,4,5,6> - 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> - 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> - 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> - 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> - 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS - 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> - 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> - 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> - 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS - 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> - 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> - 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS - 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> - 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> - 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> - 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> - 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> - 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> - 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> - 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> - 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> - 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> - 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS - 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> - 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> - 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> - 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS - 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS - 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> - 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> - 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> - 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS - 3652158198U, // <6,5,5,1>: Cost 4 
vext1 <4,6,5,5>, <1,0,3,2> - 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> - 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> - 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS - 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> - 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> - 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> - 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> - 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> - 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> - 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> - 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> - 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> - 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> - 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> - 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> - 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> - 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS - 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> - 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> - 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> - 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS - 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> - 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> - 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS - 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS - 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS - 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> - 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> - 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> - 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS - 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS - 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> - 1175277674U, // 
<6,5,u,7>: Cost 2 vrev <5,6,7,u> - 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> - 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS - 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> - 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> - 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> - 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> - 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> - 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS - 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS - 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> - 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> - 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> - 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> - 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> - 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> - 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> - 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS - 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> - 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> - 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> - 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> - 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> - 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> - 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> - 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> - 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> - 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> - 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> - 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> - 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> - 
2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> - 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> - 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> - 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS - 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> - 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS - 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> - 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> - 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> - 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS - 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> - 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> - 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> - 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS - 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> - 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> - 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> - 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> - 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> - 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> - 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS - 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS - 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS - 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> - 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> - 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> - 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS - 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> - 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS - 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> - 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS - 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS - 
3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> - 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> - 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> - 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS - 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> - 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> - 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS - 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS - 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS - 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> - 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS - 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS - 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS - 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS - 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS - 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS - 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> - 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> - 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> - 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS - 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> - 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> - 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> - 2644952509U, 
// <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> - 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> - 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> - 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> - 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> - 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> - 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> - 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> - 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS - 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> - 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS - 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> - 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> - 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> - 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> - 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1573204217U, // 
<6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> - 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> - 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> - 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> - 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> - 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> - 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> - 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> - 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> - 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> - 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> - 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> - 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> - 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> - 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> - 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> - 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> - 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> - 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS - 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> - 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> - 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> - 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS - 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> - 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> - 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS - 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS - 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> - 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> - 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 1193130221U, // 
<6,u,0,7>: Cost 2 vrev <u,6,7,0> - 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS - 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS - 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> - 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS - 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS - 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> - 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> - 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> - 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> - 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> - 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> - 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> - 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497478967U, // <6,u,4,5>: Cost 1 vext2 
RHS, RHS - 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> - 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS - 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS - 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> - 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> - 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS - 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> - 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> - 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS - 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> - 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS - 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS - 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS - 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> - 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> - 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS - 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS - 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> - 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> - 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS - 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS - 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS - 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS - 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> - 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS - 
497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS - 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS - 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS - 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS - 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> - 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> - 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> - 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> - 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> - 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> - 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> - 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> - 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS - 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> - 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS - 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> - 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS - 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> - 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> - 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> - 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> - 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> - 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> - 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> - 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> - 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> - 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> - 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> - 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> - 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> - 2651605404U, // <7,0,3,3>: Cost 3 
vext2 <5,6,7,0>, <3,3,3,3> - 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> - 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> - 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> - 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> - 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> - 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> - 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> - 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> - 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> - 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> - 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS - 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> - 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> - 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> - 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS - 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> - 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> - 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> - 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> - 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> - 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> - 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> - 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> - 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> - 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> - 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> - 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> - 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> - 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> - 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> - 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> - 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> - 2651608058U, 
// <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> - 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS - 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS - 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> - 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> - 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> - 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> - 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> - 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> - 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> - 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> - 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS - 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> - 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> - 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS - 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> - 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> - 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS - 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS - 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS - 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS - 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> - 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS - 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> - 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> - 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> - 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> - 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> - 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> - 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> - 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> - 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> - 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> - 
3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> - 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> - 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> - 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> - 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> - 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> - 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> - 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> - 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> - 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> - 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> - 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> - 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> - 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> - 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> - 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> - 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> - 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> - 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> - 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> - 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> - 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> - 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> - 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> - 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS - 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> - 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> - 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS - 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS - 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> - 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> - 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> - 2572815670U, // <7,1,5,4>: Cost 3 vext1 
<3,7,1,5>, RHS - 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> - 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> - 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> - 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> - 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> - 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> - 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> - 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> - 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> - 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> - 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> - 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> - 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> - 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> - 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> - 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> - 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS - 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS - 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> - 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> - 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> - 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS - 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS - 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> - 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> - 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> - 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS - 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> - 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> - 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS - 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> - 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> - 2712061373U, // <7,2,0,1>: 
Cost 3 vext3 RHS, <2,0,1,2> - 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> - 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> - 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> - 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> - 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> - 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> - 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> - 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> - 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> - 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> - 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> - 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> - 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> - 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> - 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> - 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> - 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> - 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> - 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> - 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> - 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> - 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> - 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> - 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> - 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> - 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> - 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> - 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> - 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> - 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> - 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> - 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> - 1638319854U, // <7,2,3,u>: 
Cost 2 vext3 RHS, <2,3,u,1> - 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> - 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> - 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> - 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> - 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> - 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> - 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> - 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> - 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> - 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> - 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> - 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> - 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> - 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> - 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> - 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> - 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> - 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> - 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS - 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> - 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> - 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> - 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS - 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> - 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> - 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> - 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> - 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> - 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> - 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> - 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS - 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS - 3775113245U, // <7,2,7,5>: Cost 
4 vext3 <2,7,5,7>, <2,7,5,7> - 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> - 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> - 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS - 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> - 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> - 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> - 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> - 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> - 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> - 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> - 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> - 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> - 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> - 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> - 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> - 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> - 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> - 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> - 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> - 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> - 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> - 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> - 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> - 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> - 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> - 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> - 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> - 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> - 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> - 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> - 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> - 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> - 2712062280U, // 
<7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> - 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> - 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> - 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> - 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> - 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> - 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> - 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> - 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> - 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> - 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> - 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> - 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> - 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> - 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> - 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> - 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> - 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> - 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> - 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> - 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> - 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> - 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS - 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> - 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> - 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> - 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> - 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> - 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> - 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> - 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> - 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, 
<3,6,0,7> - 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> - 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> - 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> - 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> - 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> - 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> - 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> - 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> - 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> - 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> - 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> - 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> - 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> - 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> - 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> - 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> - 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> - 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> - 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> - 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> - 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> - 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> - 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> - 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> - 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> - 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> - 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> - 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> - 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> - 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, 
<4,0,6,2> - 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> - 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> - 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> - 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> - 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> - 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> - 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> - 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> - 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> - 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> - 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> - 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> - 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> - 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> - 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> - 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> - 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> - 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> - 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> - 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> - 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> - 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> - 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> - 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> - 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> - 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> - 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> - 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> - 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> - 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> - 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> - 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> - 1638321360U, // 
<7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> - 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> - 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> - 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS - 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> - 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> - 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> - 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS - 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> - 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS - 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> - 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS - 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> - 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> - 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> - 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> - 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> - 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> - 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> - 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> - 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> - 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> - 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> - 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> - 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> - 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> - 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> - 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS - 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS - 
2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> - 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> - 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> - 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> - 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS - 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> - 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS - 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS - 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS - 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> - 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> - 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> - 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> - 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> - 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> - 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> - 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> - 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> - 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> - 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> - 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> - 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> - 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> - 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> - 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> - 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> - 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> - 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> - 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> - 
3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS - 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> - 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> - 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> - 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS - 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> - 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> - 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> - 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> - 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS - 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> - 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> - 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> - 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS - 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS - 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> - 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> - 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> - 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> - 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> - 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> - 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> - 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> - 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> - 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> - 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> - 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> - 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> - 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> - 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, 
<5,6,6,7> - 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> - 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> - 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS - 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> - 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> - 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> - 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS - 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> - 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> - 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS - 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> - 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> - 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS - 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> - 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> - 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> - 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> - 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> - 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> - 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> - 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> - 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> - 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> - 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> - 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS - 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> - 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> - 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> - 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> - 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS - 
2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> - 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> - 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> - 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> - 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> - 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> - 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> - 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> - 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> - 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> - 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> - 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> - 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> - 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> - 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> - 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> - 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> - 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> - 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> - 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> - 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> - 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> - 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> - 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> - 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> - 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS - 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> - 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> - 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS - 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS - 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> - 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> - 3785806538U, // 
<7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> - 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> - 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> - 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> - 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> - 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> - 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> - 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> - 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> - 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> - 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> - 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> - 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> - 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> - 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> - 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> - 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> - 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> - 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> - 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> - 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> - 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> - 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> - 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> - 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> - 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> - 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS - 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> - 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> - 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> - 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> - 
2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> - 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> - 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> - 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> - 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> - 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> - 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> - 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> - 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> - 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> - 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> - 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS - 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> - 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> - 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> - 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> - 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> - 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> - 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> - 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> - 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> - 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> - 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> - 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> - 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> - 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> - 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> - 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> - 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> - 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> - 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> - 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> - 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> - 
2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> - 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> - 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> - 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> - 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> - 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> - 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> - 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> - 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> - 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> - 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS - 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> - 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> - 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> - 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> - 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> - 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> - 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> - 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> - 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS - 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> - 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> - 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> - 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS - 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> - 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> - 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> - 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> - 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS - 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> - 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> - 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> - 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS - 2651002296U, // <7,7,7,5>: 
Cost 3 vext2 <5,5,7,7>, <7,5,5,7> - 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> - 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS - 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS - 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> - 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> - 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> - 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS - 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> - 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> - 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS - 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> - 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> - 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> - 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> - 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> - 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> - 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS - 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> - 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS - 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS - 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> - 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS - 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> - 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> - 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> - 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> - 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> - 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 
2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> - 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> - 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> - 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> - 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> - 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> - 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> - 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> - 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> - 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> - 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> - 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> - 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> - 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> - 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> - 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> - 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> - 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> - 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> - 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS - 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> - 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> - 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> - 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> - 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> - 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS - 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> - 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS - 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> - 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> - 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> - 1638324432U, // <7,u,6,3>: 
Cost 2 vext3 RHS, <u,6,3,7> - 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> - 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> - 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> - 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> - 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> - 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> - 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> - 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> - 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> - 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> - 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> - 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS - 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> - 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> - 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS - 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> - 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> - 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> - 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS - 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS - 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS - 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS - 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS - 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> - 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> - 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS - 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS - 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS - 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS - 2625135576U, 
// <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> - 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS - 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> - 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> - 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> - 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS - 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS - 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> - 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> - 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 72589981U, // <u,0,3,2>: Cost 1 vrev LHS - 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> - 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> - 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> - 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> - 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> - 73032403U, // <u,0,3,u>: Cost 1 vrev LHS - 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> - 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> - 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS - 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS - 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> - 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS - 1863532646U, // 
<u,0,5,1>: Cost 2 vzipl RHS, LHS - 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> - 2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> - 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> - 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> - 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> - 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS - 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> - 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS - 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> - 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS - 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> - 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS - 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> - 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> - 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> - 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS - 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS - 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> - 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> - 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> - 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS - 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> - 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> - 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> - 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS - 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS - 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS - 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS - 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> - 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS - 537748179U, // <u,0,u,u>: Cost 
1 vext3 LHS, LHS - 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> - 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS - 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> - 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS - 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> - 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> - 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> - 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS - 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS - 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> - 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS - 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> - 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS - 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS - 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> - 835584U, // <u,1,2,3>: Cost 0 copy LHS - 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS - 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> - 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> - 835584U, // <u,1,2,u>: Cost 0 copy LHS - 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS - 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> - 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS - 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS - 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2640407307U, // <u,1,3,7>: Cost 3 vext2 
<3,7,u,1>, <3,7,u,1> - 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> - 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> - 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> - 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> - 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> - 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS - 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS - 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> - 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> - 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> - 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS - 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> - 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> - 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS - 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> - 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> - 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS - 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS - 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> - 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS - 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> - 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> - 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> - 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS - 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> - 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> - 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS - 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS - 2226300309U, // <u,1,7,5>: 
Cost 3 vrev <1,u,5,7> - 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> - 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> - 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS - 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS - 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS - 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 835584U, // <u,1,u,3>: Cost 0 copy LHS - 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS - 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> - 835584U, // <u,1,u,u>: Cost 0 copy LHS - 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> - 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS - 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> - 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> - 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> - 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> - 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS - 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> - 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> - 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> - 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS - 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> - 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> - 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> - 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS - 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> - 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS - 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 1481870646U, // <u,2,2,4>: 
Cost 2 vext1 <0,u,2,2>, RHS - 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> - 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS - 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS - 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS - 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS - 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS - 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> - 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> - 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS - 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS - 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> - 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS - 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS - 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> - 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> - 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> - 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS - 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> - 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS - 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> - 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> - 1611491258U, // 
<u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS - 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> - 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> - 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> - 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> - 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS - 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS - 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> - 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> - 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS - 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS - 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS - 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS - 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS - 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS - 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS - 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS - 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS - 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> - 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> - 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> - 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> - 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS - 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 
1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> - 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> - 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> - 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS - 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> - 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> - 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS - 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> - 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> - 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> - 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> - 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS - 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> - 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> - 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> - 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS - 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS - 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> - 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> - 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> - 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS - 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS - 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> - 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS - 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS - 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> - 2567653106U, // 
<u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> - 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> - 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> - 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS - 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS - 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> - 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> - 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> - 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> - 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> - 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> - 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> - 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS - 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> - 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> - 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> - 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS - 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> - 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> - 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS - 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> - 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS - 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> - 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS - 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> - 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS - 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> - 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS - 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS - 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> - 1551425638U, // 
<u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS - 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> - 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> - 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> - 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> - 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS - 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS - 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> - 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> - 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> - 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS - 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS - 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> - 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS - 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> - 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> - 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> - 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS - 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS - 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> - 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> - 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> - 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> - 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> - 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> - 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> - 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, 
<1,3,5,7> - 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> - 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS - 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> - 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> - 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS - 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS - 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS - 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS - 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> - 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> - 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS - 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS - 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS - 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS - 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS - 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS - 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> - 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> - 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> - 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS - 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS - 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS - 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS - 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> - 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> - 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> - 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS - 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 96808489U, // <u,4,7,6>: 
Cost 1 vrev RHS - 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> - 96955963U, // <u,4,7,u>: Cost 1 vrev RHS - 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS - 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS - 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> - 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> - 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS - 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS - 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS - 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS - 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS - 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> - 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS - 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> - 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> - 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> - 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> - 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS - 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS - 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> - 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> - 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> - 1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS - 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> - 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> - 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS - 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> - 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> - 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> - 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, 
RHS - 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> - 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS - 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> - 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS - 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> - 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> - 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> - 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> - 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> - 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS - 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS - 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS - 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> - 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> - 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> - 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS - 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> - 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS - 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS - 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> - 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> - 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> - 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS - 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS - 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS - 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS - 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> - 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, 
<2,u,5,6> - 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> - 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS - 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> - 27705344U, // <u,5,6,7>: Cost 0 copy RHS - 27705344U, // <u,5,6,u>: Cost 0 copy RHS - 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS - 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> - 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> - 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> - 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS - 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS - 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS - 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS - 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> - 2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> - 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> - 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS - 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS - 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> - 27705344U, // <u,5,u,7>: Cost 0 copy RHS - 27705344U, // <u,5,u,u>: Cost 0 copy RHS - 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> - 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS - 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> - 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> - 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> - 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> - 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> - 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS - 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS - 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> - 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> - 2619212694U, 
// <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> - 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> - 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> - 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> - 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS - 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> - 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS - 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> - 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> - 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> - 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS - 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> - 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> - 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> - 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> - 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> - 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> - 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> - 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> - 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> - 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> - 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS - 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS - 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS - 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> - 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> - 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> - 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS - 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> - 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS - 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS - 
2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS - 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> - 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> - 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> - 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> - 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> - 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS - 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> - 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS - 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> - 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> - 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> - 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS - 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> - 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS - 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS - 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS - 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> - 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> - 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS - 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> - 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS - 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS - 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS - 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS - 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> - 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS - 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS - 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS - 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS - 432363310U, // <u,6,u,u>: Cost 1 
vext1 RHS, LHS - 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS - 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> - 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> - 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> - 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> - 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS - 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> - 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS - 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> - 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> - 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> - 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> - 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> - 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> - 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> - 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> - 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> - 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> - 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> - 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> - 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> - 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> - 1518080992U, 
// <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> - 2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7> - 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> - 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> - 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS - 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> - 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS - 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> - 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> - 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> - 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> - 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> - 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS - 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> - 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> - 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS - 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> - 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> - 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> - 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> - 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> - 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> - 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> - 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> - 
1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> - 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> - 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS - 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> - 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS - 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> - 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> - 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> - 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS - 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> - 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS - 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS - 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS - 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS - 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> - 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> - 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> - 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> - 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS - 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS - 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS - 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS - 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> - 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> - 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS - 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS - 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> - 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS - 835584U, // <u,u,2,3>: Cost 0 copy LHS - 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS - 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544824762U, // 
<u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> - 835584U, // <u,u,2,u>: Cost 0 copy LHS - 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS - 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 120371557U, // <u,u,3,2>: Cost 1 vrev LHS - 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS - 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS - 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> - 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS - 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS - 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS - 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> - 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> - 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> - 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS - 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS - 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> - 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS - 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS - 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> - 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> - 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS - 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS - 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS - 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS - 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS - 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS - 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> - 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> - 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS - 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> - 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS - 27705344U, // <u,u,6,7>: Cost 0 copy 
RHS - 27705344U, // <u,u,6,u>: Cost 0 copy RHS - 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS - 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> - 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> - 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS - 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS - 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 120699277U, // <u,u,7,6>: Cost 1 vrev RHS - 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS - 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS - 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS - 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS - 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS - 835584U, // <u,u,u,3>: Cost 0 copy LHS - 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS - 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS - 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS - 27705344U, // <u,u,u,7>: Cost 0 copy RHS - 835584U, // <u,u,u,u>: Cost 0 copy LHS - 0 -}; +static const unsigned PerfectShuffleTable[6561 + 1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 2080972802U, // <0,0,0,1>: Cost 2 ins <0,0,u,1>, lane 2 + 1679065190U, // <0,0,0,2>: Cost 2 vuzpl <0,2,0,2>, LHS + 2085707777U, // <0,0,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2080440323U, // <0,0,0,5>: Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // <0,0,0,6>: Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // <0,0,0,7>: Cost 2 ins <0,0,0,u>, lane 3 + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 1812774912U, // <0,0,1,0>: Cost 2 vzipl LHS, <0,0,0,0> + 739033190U, // <0,0,1,1>: Cost 1 vzipl LHS, LHS + 1812775076U, // <0,0,1,2>: Cost 2 vzipl LHS, <0,2,0,2> + 2080514051U, // <0,0,1,3>: Cost 2 ins <0,0,1,u>, lane 3 + 1812816210U, // <0,0,1,4>: Cost 2 vzipl LHS, <0,4,1,5> + 2085797889U, // <0,0,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2080514051U, // <0,0,1,6>: Cost 2 ins <0,0,1,u>, lane 3 + 2080514051U, // <0,0,1,7>: Cost 2 ins <0,0,1,u>, lane 3 + 739033757U, // <0,0,1,u>: Cost 1 vzipl 
LHS, LHS + 1946992640U, // <0,0,2,0>: Cost 2 vtrnl LHS, <0,0,0,0> + 1946992650U, // <0,0,2,1>: Cost 2 vtrnl LHS, <0,0,1,1> + 873250918U, // <0,0,2,2>: Cost 1 vtrnl LHS, LHS + 1012113409U, // <0,0,2,3>: Cost 1 ins LHS, lane 1 + 1946992844U, // <0,0,2,4>: Cost 2 vtrnl LHS, <0,2,4,6> + 2080587779U, // <0,0,2,5>: Cost 2 ins <0,0,2,u>, lane 3 + 2085879809U, // <0,0,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2080587779U, // <0,0,2,7>: Cost 2 ins <0,0,2,u>, lane 3 + 873250972U, // <0,0,2,u>: Cost 1 vtrnl LHS, LHS + 2080964610U, // <0,0,3,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2080972802U, // <0,0,3,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2128388096U, // <0,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 2013437973U, // <0,0,3,3>: Cost 2 vtrnr <0,0,2,3>, <0,0,2,3> + 3154739202U, // <0,0,3,4>: Cost 3 ins <0,0,u,4>, lane 2 + 2752809474U, // <0,0,3,5>: Cost 3 vuzpl <0,2,0,2>, <3,4,5,6> + 3154755586U, // <0,0,3,6>: Cost 3 ins <0,0,u,6>, lane 2 + 2818573312U, // <0,0,3,7>: Cost 3 vuzpr <0,0,0,0>, <1,3,5,7> + 2080972802U, // <0,0,3,u>: Cost 2 ins <0,0,u,1>, lane 2 + 2080964610U, // <0,0,4,0>: Cost 2 ins <0,0,u,0>, lane 2 + 1814708326U, // <0,0,4,1>: Cost 2 vzipl <0,4,1,5>, LHS + 1947828326U, // <0,0,4,2>: Cost 2 vtrnl <0,2,4,6>, LHS + 2086002689U, // <0,0,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 1947828428U, // <0,0,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6> + 2081030149U, // <0,0,4,5>: Cost 2 ins <0,0,u,u>, lane 5 + 1679068470U, // <0,0,4,6>: Cost 2 vuzpl <0,2,0,2>, RHS + 3154477059U, // <0,0,4,7>: Cost 3 ins <0,0,4,u>, lane 3 + 1679068488U, // <0,0,4,u>: Cost 2 vuzpl <0,2,0,2>, RHS + 2080964610U, // <0,0,5,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2128527360U, // <0,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0 + 2080980994U, // <0,0,5,2>: Cost 2 ins <0,0,u,2>, lane 2 + 2086076417U, // <0,0,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 3202293760U, // <0,0,5,4>: Cost 3 ins <u,0,5,4>, lane 0 + 1947213953U, // <0,0,5,5>: Cost 2 vtrnl <0,1,5,3>, <0,1,5,3> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 1744833846U, // 
<0,0,5,7>: Cost 2 vuzpr <0,0,0,0>, RHS + 2128527360U, // <0,0,5,u>: Cost 2 ins <u,0,5,1>, lane 0 + 2080964610U, // <0,0,6,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2080972802U, // <0,0,6,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2128609280U, // <0,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 2086150145U, // <0,0,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3202367488U, // <0,0,6,4>: Cost 3 ins <u,0,6,4>, lane 0 + 2617250536U, // <0,0,6,5>: Cost 3 vext2 <0,0,0,0>, <6,5,6,7> + 1947287690U, // <0,0,6,6>: Cost 2 vtrnl <0,1,6,3>, <0,1,6,3> + 2081030149U, // <0,0,6,7>: Cost 2 ins <0,0,u,u>, lane 5 + 2080972802U, // <0,0,6,u>: Cost 2 ins <0,0,u,1>, lane 2 + 2080964610U, // <0,0,7,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2080972802U, // <0,0,7,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2080980994U, // <0,0,7,2>: Cost 2 ins <0,0,u,2>, lane 2 + 2086223873U, // <0,0,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3154739202U, // <0,0,7,4>: Cost 3 ins <0,0,u,4>, lane 2 + 2617251265U, // <0,0,7,5>: Cost 3 vext2 <0,0,0,0>, <7,5,6,7> + 3154755586U, // <0,0,7,6>: Cost 3 ins <0,0,u,6>, lane 2 + 1947361427U, // <0,0,7,7>: Cost 2 vtrnl <0,1,7,3>, <0,1,7,3> + 2080972802U, // <0,0,7,u>: Cost 2 ins <0,0,u,1>, lane 2 + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 743678054U, // <0,0,u,1>: Cost 1 vzipl LHS, LHS + 873693286U, // <0,0,u,2>: Cost 1 vtrnl LHS, LHS + 1012113409U, // <0,0,u,3>: Cost 1 ins LHS, lane 1 + 1947435212U, // <0,0,u,4>: Cost 2 vtrnl LHS, <0,2,4,6> + 2085797889U, // <0,0,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 1679071386U, // <0,0,u,6>: Cost 2 vuzpl <0,2,0,2>, RHS + 2080514051U, // <0,0,u,7>: Cost 2 ins <0,0,1,u>, lane 3 + 873693340U, // <0,0,u,u>: Cost 1 vtrnl LHS, LHS + 2085683201U, // <0,1,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 1007951877U, // <0,1,0,1>: Cost 1 ins LHS, lane 5 + 1680490598U, // <0,1,0,2>: Cost 2 vuzpl <0,4,1,5>, LHS + 1007910914U, // <0,1,0,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,0,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2081669122U, // <0,1,0,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2081677314U, // 
<0,1,0,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // <0,1,0,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007951877U, // <0,1,0,u>: Cost 1 ins LHS, lane 5 + 1812775670U, // <0,1,1,0>: Cost 2 vzipl LHS, <1,0,3,2> + 1812775732U, // <0,1,1,1>: Cost 2 vzipl LHS, <1,1,1,1> + 1812775830U, // <0,1,1,2>: Cost 2 vzipl LHS, <1,2,3,0> + 1007910914U, // <0,1,1,3>: Cost 1 ins LHS, lane 2 + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 1812817040U, // <0,1,1,5>: Cost 2 vzipl LHS, <1,5,3,7> + 2081677314U, // <0,1,1,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // <0,1,1,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // <0,1,1,u>: Cost 1 ins LHS, lane 2 + 1007509507U, // <0,1,2,0>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,1>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,2>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1007509507U, // <0,1,2,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,5>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,6>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,2,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2133680132U, // <0,1,3,0>: Cost 2 ins <u,u,3,0>, lane 4 + 2081636354U, // <0,1,3,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2133696516U, // <0,1,3,2>: Cost 2 ins <u,u,3,2>, lane 4 + 1007910914U, // <0,1,3,3>: Cost 1 ins LHS, lane 2 + 2133712900U, // <0,1,3,4>: Cost 2 ins <u,u,3,4>, lane 4 + 2081669122U, // <0,1,3,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2081677314U, // <0,1,3,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2133737476U, // <0,1,3,7>: Cost 2 ins <u,u,3,7>, lane 4 + 1007910914U, // <0,1,3,u>: Cost 1 ins LHS, lane 2 + 2081628162U, // <0,1,4,0>: Cost 2 ins <0,1,u,0>, lane 2 + 2081636354U, // <0,1,4,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2081644546U, // <0,1,4,2>: Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // <0,1,4,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,4,4>: Cost 2 ins <0,1,u,4>, lane 2 + 1007951877U, // <0,1,4,5>: Cost 1 ins LHS, lane 5 + 1680493878U, // <0,1,4,6>: Cost 2 vuzpl <0,4,1,5>, 
RHS + 2081685506U, // <0,1,4,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // <0,1,4,u>: Cost 1 ins LHS, lane 2 + 2081628162U, // <0,1,5,0>: Cost 2 ins <0,1,u,0>, lane 2 + 2133835780U, // <0,1,5,1>: Cost 2 ins <u,u,5,1>, lane 4 + 2081644546U, // <0,1,5,2>: Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // <0,1,5,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,5,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2133868548U, // <0,1,5,5>: Cost 2 ins <u,u,5,5>, lane 4 + 2133876740U, // <0,1,5,6>: Cost 2 ins <u,u,5,6>, lane 4 + 2133884932U, // <0,1,5,7>: Cost 2 ins <u,u,5,7>, lane 4 + 1007910914U, // <0,1,5,u>: Cost 1 ins LHS, lane 2 + 2081628162U, // <0,1,6,0>: Cost 2 ins <0,1,u,0>, lane 2 + 2081636354U, // <0,1,6,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2133917700U, // <0,1,6,2>: Cost 2 ins <u,u,6,2>, lane 4 + 1007910914U, // <0,1,6,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,1,6,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2081669122U, // <0,1,6,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2133950468U, // <0,1,6,6>: Cost 2 ins <u,u,6,6>, lane 4 + 1060216836U, // <0,1,6,7>: Cost 1 ins RHS, lane 4 + 1007910914U, // <0,1,6,u>: Cost 1 ins LHS, lane 2 + 2133975044U, // <0,1,7,0>: Cost 2 ins <u,u,7,0>, lane 4 + 2081636354U, // <0,1,7,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2081644546U, // <0,1,7,2>: Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // <0,1,7,3>: Cost 1 ins LHS, lane 2 + 2134007812U, // <0,1,7,4>: Cost 2 ins <u,u,7,4>, lane 4 + 2081669122U, // <0,1,7,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2134024196U, // <0,1,7,6>: Cost 2 ins <u,u,7,6>, lane 4 + 2134032388U, // <0,1,7,7>: Cost 2 ins <u,u,7,7>, lane 4 + 1007910914U, // <0,1,7,u>: Cost 1 ins LHS, lane 2 + 1007509507U, // <0,1,u,0>: Cost 1 ins LHS, lane 3 + 1007951877U, // <0,1,u,1>: Cost 1 ins LHS, lane 5 + 1007509507U, // <0,1,u,2>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1007509507U, // <0,1,u,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,u,5>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,1,u,6>: Cost 1 ins LHS, lane 3 + 
1007509507U, // <0,1,u,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 1678557184U, // <0,2,0,0>: Cost 2 vuzpl LHS, <0,0,0,0> + 1678598154U, // <0,2,0,1>: Cost 2 vuzpl LHS, <0,0,1,1> + 604815462U, // <0,2,0,2>: Cost 1 vuzpl LHS, LHS + 2081767427U, // <0,2,0,3>: Cost 2 ins <0,2,0,u>, lane 3 + 1678598348U, // <0,2,0,4>: Cost 2 vuzpl LHS, <0,2,4,6> + 2081767427U, // <0,2,0,5>: Cost 2 ins <0,2,0,u>, lane 3 + 2082340866U, // <0,2,0,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2081767427U, // <0,2,0,7>: Cost 2 ins <0,2,0,u>, lane 3 + 604815516U, // <0,2,0,u>: Cost 1 vuzpl LHS, LHS + 2752340940U, // <0,2,1,0>: Cost 3 vuzpl LHS, <1,3,0,0> + 1678558004U, // <0,2,1,1>: Cost 2 vuzpl LHS, <1,1,1,1> + 1812776552U, // <0,2,1,2>: Cost 2 vzipl LHS, <2,2,2,2> + 1678557942U, // <0,2,1,3>: Cost 2 vuzpl LHS, <1,0,3,2> + 2752340982U, // <0,2,1,4>: Cost 3 vuzpl LHS, <1,3,4,6> + 1678599168U, // <0,2,1,5>: Cost 2 vuzpl LHS, <1,3,5,7> + 1812817850U, // <0,2,1,6>: Cost 2 vzipl LHS, <2,6,3,7> + 2860466282U, // <0,2,1,7>: Cost 3 vuzpr <7,0,1,2>, <0,1,2,7> + 1678598947U, // <0,2,1,u>: Cost 2 vuzpl LHS, <1,0,u,2> + 1678558886U, // <0,2,2,0>: Cost 2 vuzpl LHS, <2,3,0,1> + 2085838849U, // <0,2,2,1>: Cost 2 ins <0,u,2,1>, lane 1 + 1678558824U, // <0,2,2,2>: Cost 2 vuzpl LHS, <2,2,2,2> + 1012113409U, // <0,2,2,3>: Cost 1 ins LHS, lane 1 + 1678558926U, // <0,2,2,4>: Cost 2 vuzpl LHS, <2,3,4,5> + 2085871617U, // <0,2,2,5>: Cost 2 ins <0,u,2,5>, lane 1 + 2085879809U, // <0,2,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2085888001U, // <0,2,2,7>: Cost 2 ins <0,u,2,7>, lane 1 + 1012113409U, // <0,2,2,u>: Cost 1 ins LHS, lane 1 + 2129698816U, // <0,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0 + 1678559382U, // <0,2,3,1>: Cost 2 vuzpl LHS, <3,0,1,2> + 2082308098U, // <0,2,3,2>: Cost 2 ins <0,2,u,2>, lane 2 + 1678559644U, // <0,2,3,3>: Cost 2 vuzpl LHS, <3,3,3,3> + 2129731584U, // <0,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0 + 1678559746U, // <0,2,3,5>: Cost 2 vuzpl LHS, <3,4,5,6> + 2082340866U, // <0,2,3,6>: Cost 
2 ins <0,2,u,6>, lane 2 + 2824782848U, // <0,2,3,7>: Cost 3 vuzpr <1,0,3,2>, <1,3,5,7> + 1678559445U, // <0,2,3,u>: Cost 2 vuzpl LHS, <3,0,u,2> + 2082062339U, // <0,2,4,0>: Cost 2 ins <0,2,4,u>, lane 3 + 2082062339U, // <0,2,4,1>: Cost 2 ins <0,2,4,u>, lane 3 + 2082308098U, // <0,2,4,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2082062339U, // <0,2,4,3>: Cost 2 ins <0,2,4,u>, lane 3 + 2082062339U, // <0,2,4,4>: Cost 2 ins <0,2,4,u>, lane 3 + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 604818742U, // <0,2,4,6>: Cost 1 vuzpl LHS, RHS + 2082062339U, // <0,2,4,7>: Cost 2 ins <0,2,4,u>, lane 3 + 604818760U, // <0,2,4,u>: Cost 1 vuzpl LHS, RHS + 3105260438U, // <0,2,5,0>: Cost 3 vtrnr <3,0,4,5>, <1,2,3,0> + 1678561408U, // <0,2,5,1>: Cost 2 vuzpl LHS, <5,7,1,3> + 2082308098U, // <0,2,5,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2086076417U, // <0,2,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2756947554U, // <0,2,5,4>: Cost 3 vuzpl LHS, <5,0,4,1> + 1678561284U, // <0,2,5,5>: Cost 2 vuzpl LHS, <5,5,5,5> + 2082340866U, // <0,2,5,6>: Cost 2 ins <0,2,u,6>, lane 2 + 1751043382U, // <0,2,5,7>: Cost 2 vuzpr <1,0,3,2>, RHS + 1751043383U, // <0,2,5,u>: Cost 2 vuzpr <1,0,3,2>, RHS + 1678562126U, // <0,2,6,0>: Cost 2 vuzpl LHS, <6,7,0,1> + 2756948257U, // <0,2,6,1>: Cost 3 vuzpl LHS, <6,0,1,2> + 2082308098U, // <0,2,6,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2086150145U, // <0,2,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 1678562166U, // <0,2,6,4>: Cost 2 vuzpl LHS, <6,7,4,5> + 2756948621U, // <0,2,6,5>: Cost 3 vuzpl LHS, <6,4,5,6> + 2082340866U, // <0,2,6,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2082357253U, // <0,2,6,7>: Cost 2 ins <0,2,u,u>, lane 5 + 2082308098U, // <0,2,6,u>: Cost 2 ins <0,2,u,2>, lane 2 + 3099378582U, // <0,2,7,0>: Cost 3 vtrnr <2,0,5,7>, <1,2,3,0> + 1678562298U, // <0,2,7,1>: Cost 2 vuzpl LHS, <7,0,1,2> + 2082308098U, // <0,2,7,2>: Cost 2 ins <0,2,u,2>, lane 2 + 2130018304U, // <0,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0 + 2645136742U, // <0,2,7,4>: Cost 3 vext2 <4,6,0,2>, <7,4,5,6> + 
1678562662U, // <0,2,7,5>: Cost 2 vuzpl LHS, <7,4,5,6> + 2082340866U, // <0,2,7,6>: Cost 2 ins <0,2,u,6>, lane 2 + 1678562924U, // <0,2,7,7>: Cost 2 vuzpl LHS, <7,7,7,7> + 2082308098U, // <0,2,7,u>: Cost 2 ins <0,2,u,2>, lane 2 + 1947436710U, // <0,2,u,0>: Cost 2 vtrnl LHS, <2,3,0,1> + 1678603987U, // <0,2,u,1>: Cost 2 vuzpl LHS, <u,0,1,2> + 604821294U, // <0,2,u,2>: Cost 1 vuzpl LHS, LHS + 1012113409U, // <0,2,u,3>: Cost 1 ins LHS, lane 1 + 1947436750U, // <0,2,u,4>: Cost 2 vtrnl LHS, <2,3,4,5> + 1678604351U, // <0,2,u,5>: Cost 2 vuzpl LHS, <u,4,5,6> + 604821658U, // <0,2,u,6>: Cost 1 vuzpl LHS, RHS + 1751043625U, // <0,2,u,7>: Cost 2 vuzpr <1,0,3,2>, RHS + 604821348U, // <0,2,u,u>: Cost 1 vuzpl LHS, LHS + 2085683201U, // <0,3,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2130149376U, // <0,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0 + 2085699585U, // <0,3,0,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1745002517U, // <0,3,0,3>: Cost 2 vuzpr <0,0,2,3>, <0,0,2,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 3021244930U, // <0,3,0,5>: Cost 3 vtrnl <0,2,0,2>, <3,4,5,6> + 3159474177U, // <0,3,0,6>: Cost 3 ins <0,u,0,6>, lane 1 + 2952791184U, // <0,3,0,7>: Cost 3 vzipr <0,0,0,0>, <1,5,3,7> + 2130149376U, // <0,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0 + 1812777110U, // <0,3,1,0>: Cost 2 vzipl LHS, <3,0,1,2> + 2085765121U, // <0,3,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2886519105U, // <0,3,1,2>: Cost 3 vzipl LHS, <3,2,2,2> + 1812777372U, // <0,3,1,3>: Cost 2 vzipl LHS, <3,3,3,3> + 1812777474U, // <0,3,1,4>: Cost 2 vzipl LHS, <3,4,5,6> + 2085797889U, // <0,3,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 3159547905U, // <0,3,1,6>: Cost 3 ins <0,u,1,6>, lane 1 + 2966733968U, // <0,3,1,7>: Cost 3 vzipr <2,3,0,1>, <1,5,3,7> + 1812777758U, // <0,3,1,u>: Cost 2 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1946994838U, // <0,3,2,1>: Cost 2 vtrnl LHS, <3,0,1,2> + 2085847041U, // <0,3,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,3,2,3>: Cost 1 ins LHS, lane 1 + 
1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 1946995202U, // <0,3,2,5>: Cost 2 vtrnl LHS, <3,4,5,6> + 2085879809U, // <0,3,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2085888001U, // <0,3,2,7>: Cost 2 ins <0,u,2,7>, lane 1 + 1012113409U, // <0,3,2,u>: Cost 1 ins LHS, lane 1 + 2887747734U, // <0,3,3,0>: Cost 3 vzipl <0,3,1,0>, <3,0,1,2> + 2753022102U, // <0,3,3,1>: Cost 3 vuzpl <0,2,3,1>, <3,0,1,2> + 2965422838U, // <0,3,3,2>: Cost 3 vzipr <2,1,0,3>, <1,0,3,2> + 2130386944U, // <0,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0 + 2887780866U, // <0,3,3,4>: Cost 3 vzipl <0,3,1,4>, <3,4,5,6> + 2753055234U, // <0,3,3,5>: Cost 3 vuzpl <0,2,3,5>, <3,4,5,6> + 2752375389U, // <0,3,3,6>: Cost 3 vuzpl <0,1,3,3>, <3,5,6,7> + 3204161536U, // <0,3,3,7>: Cost 3 ins <u,3,3,7>, lane 0 + 2130386944U, // <0,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0 + 2888452246U, // <0,3,4,0>: Cost 3 vzipl <0,4,1,5>, <3,0,1,2> + 3021572246U, // <0,3,4,1>: Cost 3 vtrnl <0,2,4,6>, <3,0,1,2> + 3021572257U, // <0,3,4,2>: Cost 3 vtrnl <0,2,4,6>, <3,0,2,4> + 2086002689U, // <0,3,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2888452610U, // <0,3,4,4>: Cost 3 vzipl <0,4,1,5>, <3,4,5,6> + 2130477056U, // <0,3,4,5>: Cost 2 ins <u,3,4,5>, lane 0 + 2086027265U, // <0,3,4,6>: Cost 2 ins <0,u,4,6>, lane 1 + 2818747621U, // <0,3,4,7>: Cost 3 vuzpr <0,0,2,3>, <4,4,6,7> + 2130477056U, // <0,3,4,u>: Cost 2 ins <u,3,4,5>, lane 0 + 3204251648U, // <0,3,5,0>: Cost 3 ins <u,3,5,0>, lane 0 + 3204259840U, // <0,3,5,1>: Cost 3 ins <u,3,5,1>, lane 0 + 2961457910U, // <0,3,5,2>: Cost 3 vzipr <1,4,0,5>, <1,0,3,2> + 2086076417U, // <0,3,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3204292608U, // <0,3,5,5>: Cost 3 ins <u,3,5,5>, lane 0 + 2653769826U, // <0,3,5,6>: Cost 3 vext2 <6,1,0,3>, <5,6,7,0> + 2130567168U, // <0,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0 + 2130567168U, // <0,3,5,u>: Cost 2 ins <u,3,5,7>, lane 0 + 2854506594U, // <0,3,6,0>: Cost 3 vuzpr <6,0,1,3>, <5,6,7,0> + 2653770090U, // <0,3,6,1>: 
Cost 3 vext2 <6,1,0,3>, <6,1,0,3> + 3204341760U, // <0,3,6,2>: Cost 3 ins <u,3,6,2>, lane 0 + 2086150145U, // <0,3,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3204358144U, // <0,3,6,4>: Cost 3 ins <u,3,6,4>, lane 0 + 3204366336U, // <0,3,6,5>: Cost 3 ins <u,3,6,5>, lane 0 + 3204374528U, // <0,3,6,6>: Cost 3 ins <u,3,6,6>, lane 0 + 2130640896U, // <0,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0 + 2086150145U, // <0,3,6,u>: Cost 2 ins <0,u,6,3>, lane 1 + 2968109974U, // <0,3,7,0>: Cost 3 vzipr <2,5,0,7>, <1,2,3,0> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 2660406420U, // <0,3,7,2>: Cost 3 vext2 <7,2,0,3>, <7,2,0,3> + 2086223873U, // <0,3,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3204431872U, // <0,3,7,4>: Cost 3 ins <u,3,7,4>, lane 0 + 3204440064U, // <0,3,7,5>: Cost 3 ins <u,3,7,5>, lane 0 + 2752378305U, // <0,3,7,6>: Cost 3 vuzpl <0,1,3,3>, <7,5,6,7> + 3204456448U, // <0,3,7,7>: Cost 3 ins <u,3,7,7>, lane 0 + 2086223873U, // <0,3,7,u>: Cost 2 ins <0,u,7,3>, lane 1 + 1817421974U, // <0,3,u,0>: Cost 2 vzipl LHS, <3,0,1,2> + 1947437206U, // <0,3,u,1>: Cost 2 vtrnl LHS, <3,0,1,2> + 2085699585U, // <0,3,u,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1012113409U, // <0,3,u,3>: Cost 1 ins LHS, lane 1 + 1817422338U, // <0,3,u,4>: Cost 2 vzipl LHS, <3,4,5,6> + 1947437570U, // <0,3,u,5>: Cost 2 vtrnl LHS, <3,4,5,6> + 2085879809U, // <0,3,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2130567168U, // <0,3,u,7>: Cost 2 ins <u,3,5,7>, lane 0 + 1012113409U, // <0,3,u,u>: Cost 1 ins LHS, lane 1 + 2085683201U, // <0,4,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2083684357U, // <0,4,0,1>: Cost 2 ins <0,4,u,u>, lane 5 + 1679392870U, // <0,4,0,2>: Cost 2 vuzpl <0,2,4,6>, LHS + 2085707777U, // <0,4,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 1679392972U, // <0,4,0,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6> + 2083659778U, // <0,4,0,5>: Cost 2 ins <0,4,u,5>, lane 2 + 1947503926U, // <0,4,0,6>: Cost 2 vtrnl <0,2,0,2>, RHS + 3156836355U, // <0,4,0,7>: Cost 3 ins <0,4,0,u>, lane 3 + 1947503944U, // <0,4,0,u>: Cost 2 vtrnl 
<0,2,0,2>, RHS + 2083168259U, // <0,4,1,0>: Cost 2 ins <0,4,1,u>, lane 3 + 2085765121U, // <0,4,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2083168259U, // <0,4,1,2>: Cost 2 ins <0,4,1,u>, lane 3 + 2083168259U, // <0,4,1,3>: Cost 2 ins <0,4,1,u>, lane 3 + 2083168259U, // <0,4,1,4>: Cost 2 ins <0,4,1,u>, lane 3 + 739036470U, // <0,4,1,5>: Cost 1 vzipl LHS, RHS + 1948929334U, // <0,4,1,6>: Cost 2 vtrnl <0,4,1,5>, RHS + 2083168259U, // <0,4,1,7>: Cost 2 ins <0,4,1,u>, lane 3 + 739036713U, // <0,4,1,u>: Cost 1 vzipl LHS, RHS + 2083241987U, // <0,4,2,0>: Cost 2 ins <0,4,2,u>, lane 3 + 2083241987U, // <0,4,2,1>: Cost 2 ins <0,4,2,u>, lane 3 + 2085847041U, // <0,4,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,4,2,3>: Cost 1 ins LHS, lane 1 + 2083241987U, // <0,4,2,4>: Cost 2 ins <0,4,2,u>, lane 3 + 1813286198U, // <0,4,2,5>: Cost 2 vzipl <0,2,0,2>, RHS + 873254198U, // <0,4,2,6>: Cost 1 vtrnl LHS, RHS + 2083241987U, // <0,4,2,7>: Cost 2 ins <0,4,2,u>, lane 3 + 873254216U, // <0,4,2,u>: Cost 1 vtrnl LHS, RHS + 3020811514U, // <0,4,3,0>: Cost 3 vtrnl <0,1,3,3>, <4,5,0,1> + 2753136790U, // <0,4,3,1>: Cost 3 vuzpl <0,2,4,6>, <3,0,1,2> + 2753136801U, // <0,4,3,2>: Cost 3 vuzpl <0,2,4,6>, <3,0,2,4> + 2085928961U, // <0,4,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 3204800512U, // <0,4,3,4>: Cost 3 ins <u,4,3,4>, lane 0 + 2083659778U, // <0,4,3,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2083667970U, // <0,4,3,6>: Cost 2 ins <0,4,u,6>, lane 2 + 3087183077U, // <0,4,3,7>: Cost 3 vtrnr <0,0,2,3>, <4,4,6,7> + 2083659778U, // <0,4,3,u>: Cost 2 ins <0,4,u,5>, lane 2 + 2753137995U, // <0,4,4,0>: Cost 3 vuzpl <0,2,4,6>, <4,6,0,1> + 2888453090U, // <0,4,4,1>: Cost 3 vzipl <0,4,1,5>, <4,1,5,0> + 2888535100U, // <0,4,4,2>: Cost 3 vzipl <0,4,2,6>, <4,2,6,0> + 2086002689U, // <0,4,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2131132416U, // <0,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0 + 1814711606U, // <0,4,4,5>: Cost 2 vzipl <0,4,1,5>, RHS + 1679396150U, // <0,4,4,6>: Cost 2 vuzpl <0,2,4,6>, RHS + 3157131267U, // 
<0,4,4,7>: Cost 3 ins <0,4,4,u>, lane 3 + 1679396168U, // <0,4,4,u>: Cost 2 vuzpl <0,2,4,6>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3204931584U, // <0,4,5,2>: Cost 3 ins <u,4,5,2>, lane 0 + 2086076417U, // <0,4,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2131214336U, // <0,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0 + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2830699830U, // <0,4,5,7>: Cost 3 vuzpr <2,0,2,4>, RHS + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2712227146U, // <0,4,6,0>: Cost 3 vext3 <4,6,0,0>, <4,6,0,0> + 2753138977U, // <0,4,6,1>: Cost 3 vuzpl <0,2,4,6>, <6,0,1,2> + 2753138988U, // <0,4,6,2>: Cost 3 vuzpl <0,2,4,6>, <6,0,2,4> + 2086150145U, // <0,4,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 2712522094U, // <0,4,6,4>: Cost 3 vext3 <4,6,4,0>, <4,6,4,0> + 2083659778U, // <0,4,6,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2131296256U, // <0,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0 + 2083684357U, // <0,4,6,7>: Cost 2 ins <0,4,u,u>, lane 5 + 2083659778U, // <0,4,6,u>: Cost 2 ins <0,4,u,5>, lane 2 + 3021106426U, // <0,4,7,0>: Cost 3 vtrnl <0,1,7,3>, <4,5,0,1> + 2860487502U, // <0,4,7,1>: Cost 3 vuzpr <7,0,1,4>, <6,7,0,1> + 3157377026U, // <0,4,7,2>: Cost 3 ins <0,4,u,2>, lane 2 + 2086223873U, // <0,4,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3205095424U, // <0,4,7,4>: Cost 3 ins <u,4,7,4>, lane 0 + 2083659778U, // <0,4,7,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2131369984U, // <0,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 2752452204U, // <0,4,7,7>: Cost 3 vuzpl <0,1,4,3>, <7,7,7,7> + 2083659778U, // <0,4,7,u>: Cost 2 ins <0,4,u,5>, lane 2 + 2083168259U, // <0,4,u,0>: Cost 2 ins <0,4,1,u>, lane 3 + 2083684357U, // <0,4,u,1>: Cost 2 ins <0,4,u,u>, lane 5 + 1679398702U, // <0,4,u,2>: Cost 2 vuzpl <0,2,4,6>, LHS + 1012113409U, // <0,4,u,3>: Cost 1 ins LHS, lane 1 + 1679392972U, // <0,4,u,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6> + 743681334U, 
// <0,4,u,5>: Cost 1 vzipl LHS, RHS + 873696566U, // <0,4,u,6>: Cost 1 vtrnl LHS, RHS + 2083168259U, // <0,4,u,7>: Cost 2 ins <0,4,1,u>, lane 3 + 873696584U, // <0,4,u,u>: Cost 1 vtrnl LHS, RHS + 2085683201U, // <0,5,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2131476480U, // <0,5,0,1>: Cost 2 ins <u,5,0,1>, lane 0 + 2085699585U, // <0,5,0,2>: Cost 2 ins <0,u,0,2>, lane 1 + 2085707777U, // <0,5,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 3159457793U, // <0,5,0,4>: Cost 3 ins <0,u,0,4>, lane 1 + 1678778497U, // <0,5,0,5>: Cost 2 vuzpl <0,1,5,3>, <0,1,5,3> + 3159474177U, // <0,5,0,6>: Cost 3 ins <0,u,0,6>, lane 1 + 2013269302U, // <0,5,0,7>: Cost 2 vtrnr <0,0,0,0>, RHS + 2085699585U, // <0,5,0,u>: Cost 2 ins <0,u,0,2>, lane 1 + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2085765121U, // <0,5,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 3159515137U, // <0,5,1,2>: Cost 3 ins <0,u,1,2>, lane 1 + 2085781505U, // <0,5,1,3>: Cost 2 ins <0,u,1,3>, lane 1 + 1812778950U, // <0,5,1,4>: Cost 2 vzipl LHS, <5,4,7,6> + 2085797889U, // <0,5,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 1812779106U, // <0,5,1,6>: Cost 2 vzipl LHS, <5,6,7,0> + 2013351222U, // <0,5,1,7>: Cost 2 vtrnr <0,0,1,1>, RHS + 2085765121U, // <0,5,1,u>: Cost 2 ins <0,u,1,1>, lane 1 + 2085830657U, // <0,5,2,0>: Cost 2 ins <0,u,2,0>, lane 1 + 1946996864U, // <0,5,2,1>: Cost 2 vtrnl LHS, <5,7,1,3> + 2085847041U, // <0,5,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,5,2,3>: Cost 1 ins LHS, lane 1 + 2085863425U, // <0,5,2,4>: Cost 2 ins <0,u,2,4>, lane 1 + 1946996740U, // <0,5,2,5>: Cost 2 vtrnl LHS, <5,5,5,5> + 2085879809U, // <0,5,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2019478838U, // <0,5,2,7>: Cost 2 vtrnr <1,0,3,2>, RHS + 1012113409U, // <0,5,2,u>: Cost 1 ins LHS, lane 1 + 2637858966U, // <0,5,3,0>: Cost 3 vext2 <3,4,0,5>, <3,0,1,2> + 3205439488U, // <0,5,3,1>: Cost 3 ins <u,5,3,1>, lane 0 + 3087183153U, // <0,5,3,2>: Cost 3 vtrnr <0,0,2,3>, <4,5,6,2> + 2085928961U, // <0,5,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 
2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3205472256U, // <0,5,3,5>: Cost 3 ins <u,5,3,5>, lane 0 + 3205480448U, // <0,5,3,6>: Cost 3 ins <u,5,3,6>, lane 0 + 2131746816U, // <0,5,3,7>: Cost 2 ins <u,5,3,7>, lane 0 + 2131746816U, // <0,5,3,u>: Cost 2 ins <u,5,3,7>, lane 0 + 2888453704U, // <0,5,4,0>: Cost 3 vzipl <0,4,1,5>, <5,0,1,2> + 3159728129U, // <0,5,4,1>: Cost 3 ins <0,u,4,1>, lane 1 + 3159736321U, // <0,5,4,2>: Cost 3 ins <0,u,4,2>, lane 1 + 2086002689U, // <0,5,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2888454068U, // <0,5,4,4>: Cost 3 vzipl <0,4,1,5>, <5,4,5,6> + 2131804160U, // <0,5,4,5>: Cost 2 ins <u,5,4,5>, lane 0 + 2086027265U, // <0,5,4,6>: Cost 2 ins <0,u,4,6>, lane 1 + 2131820544U, // <0,5,4,7>: Cost 2 ins <u,5,4,7>, lane 0 + 2086027265U, // <0,5,4,u>: Cost 2 ins <0,u,4,6>, lane 1 + 3205578752U, // <0,5,5,0>: Cost 3 ins <u,5,5,0>, lane 0 + 2997291922U, // <0,5,5,1>: Cost 3 vzipr <7,4,0,5>, <4,0,5,1> + 2752523939U, // <0,5,5,2>: Cost 3 vuzpl <0,1,5,3>, <5,1,2,3> + 2086076417U, // <0,5,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 3205611520U, // <0,5,5,4>: Cost 3 ins <u,5,5,4>, lane 0 + 2131877888U, // <0,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0 + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2131894272U, // <0,5,5,7>: Cost 2 ins <u,5,5,7>, lane 0 + 2086076417U, // <0,5,5,u>: Cost 2 ins <0,u,5,3>, lane 1 + 2131910656U, // <0,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0 + 2131918848U, // <0,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 2131927040U, // <0,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2131935232U, // <0,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0 + 2131943424U, // <0,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0 + 2131951616U, // <0,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 2131959808U, // <0,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0 + 1058226176U, // <0,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <0,5,6,u>: Cost 1 ins RHS, lane 0 + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 
2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 2086223873U, // <0,5,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2132041728U, // <0,5,7,7>: Cost 2 ins <u,5,7,7>, lane 0 + 2132041728U, // <0,5,7,u>: Cost 2 ins <u,5,7,7>, lane 0 + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2085765121U, // <0,5,u,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2085699585U, // <0,5,u,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1012113409U, // <0,5,u,3>: Cost 1 ins LHS, lane 1 + 1817423814U, // <0,5,u,4>: Cost 2 vzipl LHS, <5,4,7,6> + 2085797889U, // <0,5,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2085879809U, // <0,5,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1058226176U, // <0,5,u,7>: Cost 1 ins RHS, lane 0 + 1012113409U, // <0,5,u,u>: Cost 1 ins LHS, lane 1 + 2085683201U, // <0,6,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2085691393U, // <0,6,0,1>: Cost 2 ins <0,u,0,1>, lane 1 + 2132148224U, // <0,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0 + 2085707777U, // <0,6,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 1678852234U, // <0,6,0,6>: Cost 2 vuzpl <0,1,6,3>, <0,1,6,3> + 1879051574U, // <0,6,0,7>: Cost 2 vzipr <0,0,0,0>, RHS + 2132148224U, // <0,6,0,u>: Cost 2 ins <u,6,0,2>, lane 0 + 2993278336U, // <0,6,1,0>: Cost 3 vzipr <6,7,0,1>, <4,6,6,0> + 2085765121U, // <0,6,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 1812779514U, // <0,6,1,2>: Cost 2 vzipl LHS, <6,2,7,3> + 2085781505U, // <0,6,1,3>: Cost 2 ins <0,u,1,3>, lane 1 + 3159531521U, // <0,6,1,4>: Cost 3 ins <0,u,1,4>, lane 1 + 2085797889U, // <0,6,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 1812779832U, // <0,6,1,6>: Cost 2 vzipl LHS, <6,6,6,6> + 1892994358U, // <0,6,1,7>: Cost 2 vzipr <2,3,0,1>, RHS + 1892994359U, // <0,6,1,u>: Cost 2 vzipr <2,3,0,1>, RHS + 1946997582U, // 
<0,6,2,0>: Cost 2 vtrnl LHS, <6,7,0,1> + 2085838849U, // <0,6,2,1>: Cost 2 ins <0,u,2,1>, lane 1 + 2085847041U, // <0,6,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,6,2,3>: Cost 1 ins LHS, lane 1 + 1946997622U, // <0,6,2,4>: Cost 2 vtrnl LHS, <6,7,4,5> + 2085871617U, // <0,6,2,5>: Cost 2 ins <0,u,2,5>, lane 1 + 2085879809U, // <0,6,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1880395062U, // <0,6,2,7>: Cost 2 vzipr <0,2,0,2>, RHS + 1012113409U, // <0,6,2,u>: Cost 1 ins LHS, lane 1 + 3122942050U, // <0,6,3,0>: Cost 3 vtrnr <6,0,1,3>, <5,6,7,0> + 2250527010U, // <0,6,3,1>: Cost 3 vrev <6,0,1,3> + 3206111232U, // <0,6,3,2>: Cost 3 ins <u,6,3,2>, lane 0 + 2085928961U, // <0,6,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 3206127616U, // <0,6,3,4>: Cost 3 ins <u,6,3,4>, lane 0 + 3206135808U, // <0,6,3,5>: Cost 3 ins <u,6,3,5>, lane 0 + 3206144000U, // <0,6,3,6>: Cost 3 ins <u,6,3,6>, lane 0 + 2132410368U, // <0,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0 + 2132410368U, // <0,6,3,u>: Cost 2 ins <u,6,3,7>, lane 0 + 2888536380U, // <0,6,4,0>: Cost 3 vzipl <0,4,2,6>, <6,0,4,2> + 3021574433U, // <0,6,4,1>: Cost 3 vtrnl <0,2,4,6>, <6,0,1,2> + 3021574444U, // <0,6,4,2>: Cost 3 vtrnl <0,2,4,6>, <6,0,2,4> + 2086002689U, // <0,6,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2086019073U, // <0,6,4,5>: Cost 2 ins <0,u,4,5>, lane 1 + 2132475904U, // <0,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0 + 2954153270U, // <0,6,4,7>: Cost 3 vzipr <0,2,0,4>, RHS + 2132475904U, // <0,6,4,u>: Cost 2 ins <u,6,4,6>, lane 0 + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3206250496U, // <0,6,5,1>: Cost 3 ins <u,6,5,1>, lane 0 + 3206258688U, // <0,6,5,2>: Cost 3 ins <u,6,5,2>, lane 0 + 2086076417U, // <0,6,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 3206275072U, // <0,6,5,4>: Cost 3 ins <u,6,5,4>, lane 0 + 3206283264U, // <0,6,5,5>: Cost 3 ins <u,6,5,5>, lane 0 + 3206291456U, // <0,6,5,6>: Cost 3 ins <u,6,5,6>, lane 0 + 2961460534U, // <0,6,5,7>: Cost 3 vzipr 
<1,4,0,5>, RHS + 2086076417U, // <0,6,5,u>: Cost 2 ins <0,u,5,3>, lane 1 + 2724172540U, // <0,6,6,0>: Cost 3 vext3 <6,6,0,0>, <6,6,0,0> + 2889838972U, // <0,6,6,1>: Cost 3 vzipl <0,6,2,3>, <6,1,2,3> + 2997300124U, // <0,6,6,2>: Cost 3 vzipr <7,4,0,6>, <4,0,6,2> + 2086150145U, // <0,6,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3206348800U, // <0,6,6,4>: Cost 3 ins <u,6,6,4>, lane 0 + 2889839336U, // <0,6,6,5>: Cost 3 vzipl <0,6,2,3>, <6,5,6,7> + 2132623360U, // <0,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0 + 2132631552U, // <0,6,6,7>: Cost 2 ins <u,6,6,7>, lane 0 + 2086150145U, // <0,6,6,u>: Cost 2 ins <0,u,6,3>, lane 1 + 2132647936U, // <0,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0 + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3206406144U, // <0,6,7,2>: Cost 3 ins <u,6,7,2>, lane 0 + 2086223873U, // <0,6,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 2132680704U, // <0,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0 + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3206438912U, // <0,6,7,6>: Cost 3 ins <u,6,7,6>, lane 0 + 2132705280U, // <0,6,7,7>: Cost 2 ins <u,6,7,7>, lane 0 + 2132647936U, // <0,6,7,u>: Cost 2 ins <u,6,7,0>, lane 0 + 2132647936U, // <0,6,u,0>: Cost 2 ins <u,6,7,0>, lane 0 + 2085765121U, // <0,6,u,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2132148224U, // <0,6,u,2>: Cost 2 ins <u,6,0,2>, lane 0 + 1012113409U, // <0,6,u,3>: Cost 1 ins LHS, lane 1 + 2132680704U, // <0,6,u,4>: Cost 2 ins <u,6,7,4>, lane 0 + 2085797889U, // <0,6,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2085879809U, // <0,6,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1880444214U, // <0,6,u,7>: Cost 2 vzipr <0,2,0,u>, RHS + 1012113409U, // <0,6,u,u>: Cost 1 ins LHS, lane 1 + 2085683201U, // <0,7,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 2132803584U, // <0,7,0,1>: Cost 2 ins <u,7,0,1>, lane 0 + 2085699585U, // <0,7,0,2>: Cost 2 ins <0,u,0,2>, lane 1 + 2085707777U, // <0,7,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 2580516150U, // <0,7,0,4>: Cost 3 vext1 <5,0,7,0>, RHS + 2580516476U, // <0,7,0,5>: Cost 3 vext1 <5,0,7,0>, 
<5,0,7,0> + 2586489173U, // <0,7,0,6>: Cost 3 vext1 <6,0,7,0>, <6,0,7,0> + 1678925971U, // <0,7,0,7>: Cost 2 vuzpl <0,1,7,3>, <0,1,7,3> + 2132803584U, // <0,7,0,u>: Cost 2 ins <u,7,0,1>, lane 0 + 1812780026U, // <0,7,1,0>: Cost 2 vzipl LHS, <7,0,1,2> + 2085765121U, // <0,7,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 2132893696U, // <0,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0 + 1812780390U, // <0,7,1,4>: Cost 2 vzipl LHS, <7,4,5,6> + 2085797889U, // <0,7,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2586497366U, // <0,7,1,6>: Cost 3 vext1 <6,0,7,1>, <6,0,7,1> + 1812780652U, // <0,7,1,7>: Cost 2 vzipl LHS, <7,7,7,7> + 2085765121U, // <0,7,1,u>: Cost 2 ins <0,u,1,1>, lane 1 + 2085830657U, // <0,7,2,0>: Cost 2 ins <0,u,2,0>, lane 1 + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 2085847041U, // <0,7,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <0,7,2,3>: Cost 1 ins LHS, lane 1 + 2085863425U, // <0,7,2,4>: Cost 2 ins <0,u,2,4>, lane 1 + 1946998118U, // <0,7,2,5>: Cost 2 vtrnl LHS, <7,4,5,6> + 2085879809U, // <0,7,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1946998380U, // <0,7,2,7>: Cost 2 vtrnl LHS, <7,7,7,7> + 1012113409U, // <0,7,2,u>: Cost 1 ins LHS, lane 1 + 2989314146U, // <0,7,3,0>: Cost 3 vzipr <6,1,0,3>, <5,6,7,0> + 3206766592U, // <0,7,3,1>: Cost 3 ins <u,7,3,1>, lane 0 + 3020813397U, // <0,7,3,2>: Cost 3 vtrnl <0,1,3,3>, <7,1,2,3> + 2085928961U, // <0,7,3,3>: Cost 2 ins <0,u,3,3>, lane 1 + 3206791168U, // <0,7,3,4>: Cost 3 ins <u,7,3,4>, lane 0 + 3206799360U, // <0,7,3,5>: Cost 3 ins <u,7,3,5>, lane 0 + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3206815744U, // <0,7,3,7>: Cost 3 ins <u,7,3,7>, lane 0 + 2085928961U, // <0,7,3,u>: Cost 2 ins <0,u,3,3>, lane 1 + 3206832128U, // <0,7,4,0>: Cost 3 ins <u,7,4,0>, lane 0 + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 2086002689U, // <0,7,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 3206864896U, // 
<0,7,4,4>: Cost 3 ins <u,7,4,4>, lane 0 + 2133131264U, // <0,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0 + 2086027265U, // <0,7,4,6>: Cost 2 ins <0,u,4,6>, lane 1 + 3020887660U, // <0,7,4,7>: Cost 3 vtrnl <0,1,4,3>, <7,7,7,7> + 2133131264U, // <0,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0 + 2993311842U, // <0,7,5,0>: Cost 3 vzipr <6,7,0,5>, <5,6,7,0> + 3206914048U, // <0,7,5,1>: Cost 3 ins <u,7,5,1>, lane 0 + 3020960853U, // <0,7,5,2>: Cost 3 vtrnl <0,1,5,3>, <7,1,2,3> + 2086076417U, // <0,7,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3206946816U, // <0,7,5,5>: Cost 3 ins <u,7,5,5>, lane 0 + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2133221376U, // <0,7,5,7>: Cost 2 ins <u,7,5,7>, lane 0 + 2133221376U, // <0,7,5,u>: Cost 2 ins <u,7,5,7>, lane 0 + 2854834274U, // <0,7,6,0>: Cost 3 vuzpr <6,0,5,7>, <5,6,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3206995968U, // <0,7,6,2>: Cost 3 ins <u,7,6,2>, lane 0 + 2086150145U, // <0,7,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 3207012352U, // <0,7,6,4>: Cost 3 ins <u,7,6,4>, lane 0 + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3207028736U, // <0,7,6,6>: Cost 3 ins <u,7,6,6>, lane 0 + 2133295104U, // <0,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0 + 2086150145U, // <0,7,6,u>: Cost 2 ins <0,u,6,3>, lane 1 + 2992001122U, // <0,7,7,0>: Cost 3 vzipr <6,5,0,7>, <5,6,7,0> + 3207061504U, // <0,7,7,1>: Cost 3 ins <u,7,7,1>, lane 0 + 2752672853U, // <0,7,7,2>: Cost 3 vuzpl <0,1,7,3>, <7,1,2,3> + 2086223873U, // <0,7,7,3>: Cost 2 ins <0,u,7,3>, lane 1 + 3207086080U, // <0,7,7,4>: Cost 3 ins <u,7,7,4>, lane 0 + 3207094272U, // <0,7,7,5>: Cost 3 ins <u,7,7,5>, lane 0 + 2663093724U, // <0,7,7,6>: Cost 3 vext2 <7,6,0,7>, <7,6,0,7> + 2133368832U, // <0,7,7,7>: Cost 2 ins <u,7,7,7>, lane 0 + 2086223873U, // <0,7,7,u>: Cost 2 ins <0,u,7,3>, lane 1 + 1817424890U, // <0,7,u,0>: Cost 2 vzipl LHS, <7,0,1,2> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2085699585U, // 
<0,7,u,2>: Cost 2 ins <0,u,0,2>, lane 1 + 1012113409U, // <0,7,u,3>: Cost 1 ins LHS, lane 1 + 1817425254U, // <0,7,u,4>: Cost 2 vzipl LHS, <7,4,5,6> + 2085797889U, // <0,7,u,5>: Cost 2 ins <0,u,1,5>, lane 1 + 2085879809U, // <0,7,u,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2133221376U, // <0,7,u,7>: Cost 2 ins <u,7,5,7>, lane 0 + 1012113409U, // <0,7,u,u>: Cost 1 ins LHS, lane 1 + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1007951877U, // <0,u,0,1>: Cost 1 ins LHS, lane 5 + 605257830U, // <0,u,0,2>: Cost 1 vuzpl LHS, LHS + 1007910914U, // <0,u,0,3>: Cost 1 ins LHS, lane 2 + 1678999756U, // <0,u,0,4>: Cost 2 vuzpl LHS, <0,2,4,6> + 2081767427U, // <0,u,0,5>: Cost 2 ins <0,2,0,u>, lane 3 + 1947506842U, // <0,u,0,6>: Cost 2 vtrnl <0,2,0,2>, RHS + 2081767427U, // <0,u,0,7>: Cost 2 ins <0,2,0,u>, lane 3 + 605257884U, // <0,u,0,u>: Cost 1 vuzpl LHS, LHS + 1812821715U, // <0,u,1,0>: Cost 2 vzipl LHS, <u,0,1,2> + 739039022U, // <0,u,1,1>: Cost 1 vzipl LHS, LHS + 1813264264U, // <0,u,1,2>: Cost 2 vzipl LHS, <u,2,3,3> + 1007910914U, // <0,u,1,3>: Cost 1 ins LHS, lane 2 + 1812822079U, // <0,u,1,4>: Cost 2 vzipl LHS, <u,4,5,6> + 739039386U, // <0,u,1,5>: Cost 1 vzipl LHS, RHS + 1813264592U, // <0,u,1,6>: Cost 2 vzipl LHS, <u,6,3,7> + 1892994376U, // <0,u,1,7>: Cost 2 vzipr <2,3,0,1>, RHS + 739039589U, // <0,u,1,u>: Cost 1 vzipl LHS, LHS + 1007509507U, // <0,u,2,0>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,u,2,1>: Cost 1 ins LHS, lane 3 + 873256750U, // <0,u,2,2>: Cost 1 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1007509507U, // <0,u,2,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <0,u,2,5>: Cost 1 ins LHS, lane 3 + 873257114U, // <0,u,2,6>: Cost 1 vtrnl LHS, RHS + 1007509507U, // <0,u,2,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2133680132U, // <0,u,3,0>: Cost 2 ins <u,u,3,0>, lane 4 + 1679001750U, // <0,u,3,1>: Cost 2 vuzpl LHS, <3,0,1,2> + 2128388096U, // <0,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 1007910914U, // <0,u,3,3>: Cost 1 ins 
LHS, lane 2 + 2133712900U, // <0,u,3,4>: Cost 2 ins <u,u,3,4>, lane 4 + 1679002114U, // <0,u,3,5>: Cost 2 vuzpl LHS, <3,4,5,6> + 2082340866U, // <0,u,3,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2133737476U, // <0,u,3,7>: Cost 2 ins <u,u,3,7>, lane 4 + 1007910914U, // <0,u,3,u>: Cost 1 ins LHS, lane 2 + 2082062339U, // <0,u,4,0>: Cost 2 ins <0,2,4,u>, lane 3 + 1814714158U, // <0,u,4,1>: Cost 2 vzipl <0,4,1,5>, LHS + 1947834158U, // <0,u,4,2>: Cost 2 vtrnl <0,2,4,6>, LHS + 1007910914U, // <0,u,4,3>: Cost 1 ins LHS, lane 2 + 1947828428U, // <0,u,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6> + 1007951877U, // <0,u,4,5>: Cost 1 ins LHS, lane 5 + 605261110U, // <0,u,4,6>: Cost 1 vuzpl LHS, RHS + 2082062339U, // <0,u,4,7>: Cost 2 ins <0,2,4,u>, lane 3 + 605261128U, // <0,u,4,u>: Cost 1 vuzpl LHS, RHS + 2080964610U, // <0,u,5,0>: Cost 2 ins <0,0,u,0>, lane 2 + 2128527360U, // <0,u,5,1>: Cost 2 ins <u,0,5,1>, lane 0 + 2080980994U, // <0,u,5,2>: Cost 2 ins <0,0,u,2>, lane 2 + 1007910914U, // <0,u,5,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <0,u,5,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2133868548U, // <0,u,5,5>: Cost 2 ins <u,u,5,5>, lane 4 + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 1751092534U, // <0,u,5,7>: Cost 2 vuzpr <1,0,3,u>, RHS + 1007910914U, // <0,u,5,u>: Cost 1 ins LHS, lane 2 + 1679004494U, // <0,u,6,0>: Cost 2 vuzpl LHS, <6,7,0,1> + 2080972802U, // <0,u,6,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2128609280U, // <0,u,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 1007910914U, // <0,u,6,3>: Cost 1 ins LHS, lane 2 + 1679004534U, // <0,u,6,4>: Cost 2 vuzpl LHS, <6,7,4,5> + 2083659778U, // <0,u,6,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2133950468U, // <0,u,6,6>: Cost 2 ins <u,u,6,6>, lane 4 + 1060216836U, // <0,u,6,7>: Cost 1 ins RHS, lane 4 + 1007910914U, // <0,u,6,u>: Cost 1 ins LHS, lane 2 + 2133975044U, // <0,u,7,0>: Cost 2 ins <u,u,7,0>, lane 4 + 2080972802U, // <0,u,7,1>: Cost 2 ins <0,0,u,1>, lane 2 + 2080980994U, // <0,u,7,2>: Cost 2 ins <0,0,u,2>, lane 2 + 1007910914U, // 
<0,u,7,3>: Cost 1 ins LHS, lane 2 + 2134007812U, // <0,u,7,4>: Cost 2 ins <u,u,7,4>, lane 4 + 2083659778U, // <0,u,7,5>: Cost 2 ins <0,4,u,5>, lane 2 + 2134024196U, // <0,u,7,6>: Cost 2 ins <u,u,7,6>, lane 4 + 2134032388U, // <0,u,7,7>: Cost 2 ins <u,u,7,7>, lane 4 + 1007910914U, // <0,u,7,u>: Cost 1 ins LHS, lane 2 + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 743683886U, // <0,u,u,1>: Cost 1 vzipl LHS, LHS + 605263662U, // <0,u,u,2>: Cost 1 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1007509507U, // <0,u,u,4>: Cost 1 ins LHS, lane 3 + 743684250U, // <0,u,u,5>: Cost 1 vzipl LHS, RHS + 605264026U, // <0,u,u,6>: Cost 1 vuzpl LHS, RHS + 1007509507U, // <0,u,u,7>: Cost 1 ins LHS, lane 3 + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2128150528U, // <1,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0 + 1818148966U, // <1,0,0,1>: Cost 2 vzipl <1,0,3,2>, LHS + 2086952962U, // <1,0,0,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2891891026U, // <1,0,0,4>: Cost 3 vzipl <1,0,3,2>, <0,4,1,5> + 3165437953U, // <1,0,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 3160154115U, // <1,0,0,6>: Cost 3 ins <1,0,0,u>, lane 3 + 3160154115U, // <1,0,0,7>: Cost 3 ins <1,0,0,u>, lane 3 + 1818149533U, // <1,0,0,u>: Cost 2 vzipl <1,0,3,2>, LHS + 1141522514U, // <1,0,1,0>: Cost 2 vrev <0,1,0,1> + 1818656870U, // <1,0,1,1>: Cost 2 vzipl <1,1,1,1>, LHS + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 2091753473U, // <1,0,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 1477070134U, // <1,0,1,4>: Cost 2 vext1 <0,1,0,1>, RHS + 2760770560U, // <1,0,1,5>: Cost 3 vuzpl <1,5,0,2>, <1,3,5,7> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3165528065U, // <1,0,1,7>: Cost 3 ins <1,u,1,7>, lane 1 + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 1819459686U, // <1,0,2,1>: Cost 2 vzipl <1,2,3,0>, LHS + 2128314368U, // <1,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0 + 2087002117U, // 
<1,0,2,3>: Cost 2 ins <1,0,u,u>, lane 5 + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 2970798548U, // <1,0,2,5>: Cost 3 vzipr <3,0,1,2>, <3,4,0,5> + 3165593601U, // <1,0,2,6>: Cost 3 ins <1,u,2,6>, lane 1 + 2592625730U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,1,0,2> + 1819460253U, // <1,0,2,u>: Cost 2 vzipl <1,2,3,0>, LHS + 2014101504U, // <1,0,3,0>: Cost 2 vtrnr LHS, <0,0,0,0> + 2014101514U, // <1,0,3,1>: Cost 2 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2091900929U, // <1,0,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2091909121U, // <1,0,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2086633475U, // <1,0,3,5>: Cost 2 ins <1,0,3,u>, lane 3 + 2086633475U, // <1,0,3,6>: Cost 2 ins <1,0,3,u>, lane 3 + 2091933697U, // <1,0,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2667752338U, // <1,0,4,0>: Cost 3 vext2 <u,4,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2086952962U, // <1,0,4,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2819383641U, // <1,0,4,3>: Cost 3 vuzpr <0,1,2,0>, <0,4,2,3> + 2894569810U, // <1,0,4,4>: Cost 3 vzipl <1,4,3,5>, <0,4,1,5> + 2087002117U, // <1,0,4,5>: Cost 2 ins <1,0,u,u>, lane 5 + 2758102326U, // <1,0,4,6>: Cost 3 vuzpl <1,1,0,0>, RHS + 2819386597U, // <1,0,4,7>: Cost 3 vuzpr <0,1,2,0>, <4,4,6,7> + 2086952962U, // <1,0,4,u>: Cost 2 ins <1,0,u,2>, lane 2 + 2955558912U, // <1,0,5,0>: Cost 3 vzipr <0,4,1,5>, <0,0,0,0> + 1821507686U, // <1,0,5,1>: Cost 2 vzipl <1,5,3,7>, LHS + 1954545766U, // <1,0,5,2>: Cost 2 vtrnl <1,3,5,7>, LHS + 3165790209U, // <1,0,5,3>: Cost 3 ins <1,u,5,3>, lane 1 + 1141850234U, // <1,0,5,4>: Cost 2 vrev <0,1,4,5> + 3165806593U, // <1,0,5,5>: Cost 3 ins <1,u,5,5>, lane 1 + 3202310144U, // <1,0,5,6>: Cost 3 ins <u,0,5,6>, lane 0 + 2092081153U, // <1,0,5,7>: Cost 2 ins <1,u,5,7>, lane 1 + 1954545820U, // <1,0,5,u>: Cost 2 vtrnl <1,3,5,7>, LHS + 3202334720U, // <1,0,6,0>: Cost 3 ins <u,0,6,0>, lane 0 + 2895765606U, // <1,0,6,1>: Cost 3 vzipl 
<1,6,1,7>, LHS + 2128609280U, // <1,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 2819383803U, // <1,0,6,3>: Cost 3 vuzpr <0,1,2,0>, <0,6,2,3> + 2896060754U, // <1,0,6,4>: Cost 3 vzipl <1,6,5,7>, <0,4,1,5> + 2215673988U, // <1,0,6,5>: Cost 3 vrev <0,1,5,6> + 3165888513U, // <1,0,6,6>: Cost 3 ins <1,u,6,6>, lane 1 + 2087002117U, // <1,0,6,7>: Cost 2 ins <1,0,u,u>, lane 5 + 2128609280U, // <1,0,6,u>: Cost 2 ins <u,0,6,2>, lane 0 + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 2974156454U, // <1,0,7,1>: Cost 3 vzipr <3,5,1,7>, <2,3,0,1> + 2086952962U, // <1,0,7,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2861265024U, // <1,0,7,3>: Cost 3 vuzpr <7,1,3,0>, <5,7,1,3> + 3202441216U, // <1,0,7,4>: Cost 3 ins <u,0,7,4>, lane 0 + 3165954049U, // <1,0,7,5>: Cost 3 ins <1,u,7,5>, lane 1 + 1142014094U, // <1,0,7,6>: Cost 2 vrev <0,1,6,7> + 3165970433U, // <1,0,7,7>: Cost 3 ins <1,u,7,7>, lane 1 + 2086952962U, // <1,0,7,u>: Cost 2 ins <1,0,u,2>, lane 2 + 2014142464U, // <1,0,u,0>: Cost 2 vtrnr LHS, <0,0,0,0> + 2014142474U, // <1,0,u,1>: Cost 2 vtrnr LHS, <0,0,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2091753473U, // <1,0,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2091909121U, // <1,0,u,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2086633475U, // <1,0,u,5>: Cost 2 ins <1,0,3,u>, lane 3 + 2086633475U, // <1,0,u,6>: Cost 2 ins <1,0,3,u>, lane 3 + 2091933697U, // <1,0,u,7>: Cost 2 ins <1,u,3,7>, lane 1 + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1818149622U, // <1,1,0,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 1684439142U, // <1,1,0,2>: Cost 2 vuzpl <1,1,1,1>, LHS + 2087624706U, // <1,1,0,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2891891856U, // <1,1,0,5>: Cost 3 vzipl <1,0,3,2>, <1,5,3,7> + 3161391106U, // <1,1,0,6>: Cost 3 ins <1,1,u,6>, lane 2 + 3161399298U, // <1,1,0,7>: Cost 3 ins <1,1,u,7>, lane 2 + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, 
// <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2087149571U, // <1,1,1,2>: Cost 2 ins <1,1,1,u>, lane 3 + 1751548006U, // <1,1,1,3>: Cost 2 vuzpr <1,1,1,1>, LHS + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2087149571U, // <1,1,1,5>: Cost 2 ins <1,1,1,u>, lane 3 + 2087149571U, // <1,1,1,6>: Cost 2 ins <1,1,1,u>, lane 3 + 2087149571U, // <1,1,1,7>: Cost 2 ins <1,1,1,u>, lane 3 + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2128961536U, // <1,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0 + 2128969728U, // <1,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 1819460502U, // <1,1,2,2>: Cost 2 vzipl <1,2,3,0>, <1,2,3,0> + 1055244288U, // <1,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <1,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0 + 2129002496U, // <1,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2129010688U, // <1,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2129018880U, // <1,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0 + 1055244288U, // <1,1,2,u>: Cost 1 ins LHS, lane 0 + 2091876353U, // <1,1,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 2014102324U, // <1,1,3,1>: Cost 2 vtrnr LHS, <1,1,1,1> + 2091892737U, // <1,1,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 940359782U, // <1,1,3,3>: Cost 1 vtrnr LHS, LHS + 2091909121U, // <1,1,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2087297027U, // <1,1,3,5>: Cost 2 ins <1,1,3,u>, lane 3 + 2087297027U, // <1,1,3,6>: Cost 2 ins <1,1,3,u>, lane 3 + 2091933697U, // <1,1,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 940359787U, // <1,1,3,u>: Cost 1 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2087608322U, // <1,1,4,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2894496662U, // <1,1,4,2>: Cost 3 vzipl <1,4,2,5>, <1,2,3,0> + 2087624706U, // <1,1,4,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2014109799U, // <1,1,4,4>: Cost 2 vtrnr <0,1,2,4>, <0,1,2,4> + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 1684442422U, // <1,1,4,6>: Cost 2 vuzpl <1,1,1,1>, RHS + 3161399298U, // <1,1,4,7>: Cost 3 ins <1,1,u,7>, lane 2 + 1548897833U, // <1,1,4,u>: 
Cost 2 vext2 <0,u,1,1>, RHS + 3028288624U, // <1,1,5,0>: Cost 3 vtrnl <1,3,5,7>, <1,5,0,2> + 2087608322U, // <1,1,5,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2955561110U, // <1,1,5,2>: Cost 3 vzipr <0,4,1,5>, <3,0,1,2> + 2087624706U, // <1,1,5,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2955558925U, // <1,1,5,4>: Cost 3 vzipr <0,4,1,5>, <0,0,1,4> + 1881817426U, // <1,1,5,5>: Cost 2 vzipr <0,4,1,5>, <0,4,1,5> + 2670415970U, // <1,1,5,6>: Cost 3 vext2 <u,u,1,1>, <5,6,7,0> + 1751551286U, // <1,1,5,7>: Cost 2 vuzpr <1,1,1,1>, RHS + 1751551287U, // <1,1,5,u>: Cost 2 vuzpr <1,1,1,1>, RHS + 3165839361U, // <1,1,6,0>: Cost 3 ins <1,u,6,0>, lane 1 + 2087608322U, // <1,1,6,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2973485206U, // <1,1,6,2>: Cost 3 vzipr <3,4,1,6>, <3,0,1,2> + 2087624706U, // <1,1,6,3>: Cost 2 ins <1,1,u,3>, lane 2 + 2221572948U, // <1,1,6,4>: Cost 3 vrev <1,1,4,6> + 2955567442U, // <1,1,6,5>: Cost 3 vzipr <0,4,1,6>, <0,4,1,5> + 2014126185U, // <1,1,6,6>: Cost 2 vtrnr <0,1,2,6>, <0,1,2,6> + 2087665669U, // <1,1,6,7>: Cost 2 ins <1,1,u,u>, lane 5 + 2087624706U, // <1,1,6,u>: Cost 2 ins <1,1,u,3>, lane 2 + 2670416890U, // <1,1,7,0>: Cost 3 vext2 <u,u,1,1>, <7,0,1,2> + 2087608322U, // <1,1,7,1>: Cost 2 ins <1,1,u,1>, lane 2 + 3203088384U, // <1,1,7,2>: Cost 3 ins <u,1,7,2>, lane 0 + 2129354752U, // <1,1,7,3>: Cost 2 ins <u,1,7,3>, lane 0 + 2670417254U, // <1,1,7,4>: Cost 3 vext2 <u,u,1,1>, <7,4,5,6> + 2221654878U, // <1,1,7,5>: Cost 3 vrev <1,1,5,7> + 3161391106U, // <1,1,7,6>: Cost 3 ins <1,1,u,6>, lane 2 + 2014134378U, // <1,1,7,7>: Cost 2 vtrnr <0,1,2,7>, <0,1,2,7> + 2129354752U, // <1,1,7,u>: Cost 2 ins <u,1,7,3>, lane 0 + 1818149622U, // <1,1,u,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2> + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 1684444974U, // <1,1,u,2>: Cost 2 vuzpl <1,1,1,1>, LHS + 940400742U, // <1,1,u,3>: Cost 1 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 1684445338U, // <1,1,u,6>: Cost 2 
vuzpl <1,1,1,1>, RHS + 1751551529U, // <1,1,u,7>: Cost 2 vuzpr <1,1,1,1>, RHS + 940400747U, // <1,1,u,u>: Cost 1 vtrnr LHS, LHS + 2088263682U, // <1,2,0,0>: Cost 2 ins <1,2,u,0>, lane 2 + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2129494016U, // <1,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0 + 2954854502U, // <1,2,0,3>: Cost 3 vzipr <0,3,1,0>, LHS + 2088296450U, // <1,2,0,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3165437953U, // <1,2,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 2891892666U, // <1,2,0,6>: Cost 3 vzipl <1,0,3,2>, <2,6,3,7> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2088263682U, // <1,2,1,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2091737089U, // <1,2,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 1745657957U, // <1,2,1,2>: Cost 2 vuzpr <0,1,2,2>, <0,1,2,2> + 1884438630U, // <1,2,1,3>: Cost 2 vzipr <0,u,1,1>, LHS + 2088296450U, // <1,2,1,4>: Cost 2 ins <1,2,u,4>, lane 2 + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2958180700U, // <1,2,1,6>: Cost 3 vzipr <0,u,1,1>, <0,4,2,6> + 3165528065U, // <1,2,1,7>: Cost 3 ins <1,u,1,7>, lane 1 + 1884438635U, // <1,2,1,u>: Cost 2 vzipr <0,u,1,1>, LHS + 2088263682U, // <1,2,2,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2893235754U, // <1,2,2,1>: Cost 3 vzipl <1,2,3,4>, <2,1,4,3> + 2129641472U, // <1,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0 + 1897054310U, // <1,2,2,3>: Cost 2 vzipr <3,0,1,2>, LHS + 2088296450U, // <1,2,2,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3165585409U, // <1,2,2,5>: Cost 3 ins <1,u,2,5>, lane 1 + 2893203386U, // <1,2,2,6>: Cost 3 vzipl <1,2,3,0>, <2,6,3,7> + 2994684010U, // <1,2,2,7>: Cost 3 vzipr <7,0,1,2>, <0,1,2,7> + 1897054315U, // <1,2,2,u>: Cost 2 vzipr <3,0,1,2>, LHS + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 
1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 2014101708U, // <1,2,3,6>: Cost 2 vtrnr LHS, <0,2,4,6> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2088263682U, // <1,2,4,0>: Cost 2 ins <1,2,u,0>, lane 2 + 3162013698U, // <1,2,4,1>: Cost 3 ins <1,2,u,1>, lane 2 + 3162021890U, // <1,2,4,2>: Cost 3 ins <1,2,u,2>, lane 2 + 2954887270U, // <1,2,4,3>: Cost 3 vzipr <0,3,1,4>, LHS + 2088296450U, // <1,2,4,4>: Cost 2 ins <1,2,u,4>, lane 2 + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2129821696U, // <1,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0 + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2088263682U, // <1,2,5,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 2955558932U, // <1,2,5,2>: Cost 3 vzipr <0,4,1,5>, <0,0,2,2> + 1881817190U, // <1,2,5,3>: Cost 2 vzipr <0,4,1,5>, LHS + 2088296450U, // <1,2,5,4>: Cost 2 ins <1,2,u,4>, lane 2 + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2955559260U, // <1,2,5,6>: Cost 3 vzipr <0,4,1,5>, <0,4,2,6> + 2092081153U, // <1,2,5,7>: Cost 2 ins <1,u,5,7>, lane 1 + 1881817195U, // <1,2,5,u>: Cost 2 vzipr <0,4,1,5>, LHS + 2088263682U, // <1,2,6,0>: Cost 2 ins <1,2,u,0>, lane 2 + 3162013698U, // <1,2,6,1>: Cost 3 ins <1,2,u,1>, lane 2 + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2954240102U, // <1,2,6,3>: Cost 3 vzipr <0,2,1,6>, LHS + 2088296450U, // <1,2,6,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3162046466U, // <1,2,6,5>: Cost 3 ins <1,2,u,5>, lane 2 + 2895914938U, // <1,2,6,6>: Cost 3 vzipl <1,6,3,7>, <2,6,3,7> + 2088329221U, // <1,2,6,7>: Cost 2 ins <1,2,u,u>, lane 5 + 2088263682U, // <1,2,6,u>: Cost 2 ins <1,2,u,0>, lane 2 + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 3203743744U, // <1,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0 + 3203751936U, // <1,2,7,2>: Cost 3 ins <u,2,7,2>, lane 0 + 
2130018304U, // <1,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0 + 2088296450U, // <1,2,7,4>: Cost 2 ins <1,2,u,4>, lane 2 + 3203776512U, // <1,2,7,5>: Cost 3 ins <u,2,7,5>, lane 0 + 3203784704U, // <1,2,7,6>: Cost 3 ins <u,2,7,6>, lane 0 + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2014142668U, // <1,2,u,6>: Cost 2 vtrnr LHS, <0,2,4,6> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 1745666048U, // <1,3,0,0>: Cost 2 vuzpr LHS, <0,0,0,0> + 1746108426U, // <1,3,0,1>: Cost 2 vuzpr LHS, <0,0,1,1> + 1745666806U, // <1,3,0,2>: Cost 2 vuzpr LHS, <1,0,3,2> + 2088951810U, // <1,3,0,3>: Cost 2 ins <1,3,u,3>, lane 2 + 2819850253U, // <1,3,0,4>: Cost 3 vuzpr LHS, <0,0,1,4> + 2758984055U, // <1,3,0,5>: Cost 3 vuzpl <1,2,3,0>, <0,4,5,6> + 2867183658U, // <1,3,0,6>: Cost 3 vuzpr LHS, <0,0,4,6> + 2088984578U, // <1,3,0,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745668252U, // <1,3,0,u>: Cost 2 vuzpr LHS, <3,0,1,u> + 2088476675U, // <1,3,1,0>: Cost 2 ins <1,3,1,u>, lane 3 + 1745666868U, // <1,3,1,1>: Cost 2 vuzpr LHS, <1,1,1,1> + 2088476675U, // <1,3,1,2>: Cost 2 ins <1,3,1,u>, lane 3 + 671924326U, // <1,3,1,3>: Cost 1 vuzpr LHS, LHS + 2088476675U, // <1,3,1,4>: Cost 2 ins <1,3,1,u>, lane 3 + 2088476675U, // <1,3,1,5>: Cost 2 ins <1,3,1,u>, lane 3 + 2088476675U, // <1,3,1,6>: Cost 2 ins <1,3,1,u>, lane 3 + 2088984578U, // <1,3,1,7>: Cost 2 ins <1,3,u,7>, lane 2 + 671924331U, // <1,3,1,u>: Cost 1 vuzpr LHS, LHS + 1745666966U, // <1,3,2,0>: Cost 2 vuzpr LHS, <1,2,3,0> + 2819408044U, // <1,3,2,1>: Cost 3 vuzpr LHS, <0,2,1,1> + 
1745666212U, // <1,3,2,2>: Cost 2 vuzpr LHS, <0,2,0,2> + 1746110066U, // <1,3,2,3>: Cost 2 vuzpr LHS, <2,2,3,3> + 1745666970U, // <1,3,2,4>: Cost 2 vuzpr LHS, <1,2,3,4> + 2819408066U, // <1,3,2,5>: Cost 3 vuzpr LHS, <0,2,3,5> + 1745666252U, // <1,3,2,6>: Cost 2 vuzpr LHS, <0,2,4,6> + 2088984578U, // <1,3,2,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745666218U, // <1,3,2,u>: Cost 2 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1745667750U, // <1,3,3,1>: Cost 2 vuzpr LHS, <2,3,0,1> + 2091892737U, // <1,3,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 1745667032U, // <1,3,3,3>: Cost 2 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 1745667790U, // <1,3,3,5>: Cost 2 vuzpr LHS, <2,3,4,5> + 2819408868U, // <1,3,3,6>: Cost 3 vuzpr LHS, <1,3,2,6> + 2014102528U, // <1,3,3,7>: Cost 2 vtrnr LHS, <1,3,5,7> + 1745667037U, // <1,3,3,u>: Cost 2 vuzpr LHS, <1,3,1,u> + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2759019375U, // <1,3,4,1>: Cost 3 vuzpl <1,2,3,4>, <4,0,1,2> + 2759019466U, // <1,3,4,2>: Cost 3 vuzpl <1,2,3,4>, <4,1,2,3> + 2088951810U, // <1,3,4,3>: Cost 2 ins <1,3,u,3>, lane 2 + 1793445072U, // <1,3,4,4>: Cost 2 vuzpr LHS, <4,4,4,4> + 1746108754U, // <1,3,4,5>: Cost 2 vuzpr LHS, <0,4,1,5> + 1745668610U, // <1,3,4,6>: Cost 2 vuzpr LHS, <3,4,5,6> + 2088984578U, // <1,3,4,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745668612U, // <1,3,4,u>: Cost 2 vuzpr LHS, <3,4,5,u> + 2088771587U, // <1,3,5,0>: Cost 2 ins <1,3,5,u>, lane 3 + 2088771587U, // <1,3,5,1>: Cost 2 ins <1,3,5,u>, lane 3 + 2088771587U, // <1,3,5,2>: Cost 2 ins <1,3,5,u>, lane 3 + 2088951810U, // <1,3,5,3>: Cost 2 ins <1,3,u,3>, lane 2 + 2088771587U, // <1,3,5,4>: Cost 2 ins <1,3,5,u>, lane 3 + 1793445892U, // <1,3,5,5>: Cost 2 vuzpr LHS, <5,5,5,5> + 2088771587U, // <1,3,5,6>: Cost 2 ins <1,3,5,u>, lane 3 + 671927606U, // <1,3,5,7>: Cost 1 vuzpr LHS, RHS + 671927607U, // <1,3,5,u>: Cost 1 vuzpr LHS, RHS + 1793445986U, // <1,3,6,0>: Cost 2 vuzpr LHS, 
<5,6,7,0> + 2867185561U, // <1,3,6,1>: Cost 3 vuzpr LHS, <2,6,0,1> + 1793445196U, // <1,3,6,2>: Cost 2 vuzpr LHS, <4,6,0,2> + 2088951810U, // <1,3,6,3>: Cost 2 ins <1,3,u,3>, lane 2 + 1793445990U, // <1,3,6,4>: Cost 2 vuzpr LHS, <5,6,7,4> + 2849642738U, // <1,3,6,5>: Cost 3 vuzpr <5,1,7,3>, <u,6,7,5> + 1793445236U, // <1,3,6,6>: Cost 2 vuzpr LHS, <4,6,4,6> + 1746110394U, // <1,3,6,7>: Cost 2 vuzpr LHS, <2,6,3,7> + 1746110395U, // <1,3,6,u>: Cost 2 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 1793446734U, // <1,3,7,1>: Cost 2 vuzpr LHS, <6,7,0,1> + 2867187830U, // <1,3,7,2>: Cost 3 vuzpr LHS, <5,7,0,2> + 1793446016U, // <1,3,7,3>: Cost 2 vuzpr LHS, <5,7,1,3> + 2849637679U, // <1,3,7,4>: Cost 3 vuzpr <5,1,7,3>, <1,7,3,4> + 1793446774U, // <1,3,7,5>: Cost 2 vuzpr LHS, <6,7,4,5> + 2867185674U, // <1,3,7,6>: Cost 3 vuzpr LHS, <2,7,3,6> + 1793446056U, // <1,3,7,7>: Cost 2 vuzpr LHS, <5,7,5,7> + 1793446021U, // <1,3,7,u>: Cost 2 vuzpr LHS, <5,7,1,u> + 1746109820U, // <1,3,u,0>: Cost 2 vuzpr LHS, <1,u,3,0> + 2014144166U, // <1,3,u,1>: Cost 2 vtrnr LHS, <2,3,0,1> + 1745668894U, // <1,3,u,2>: Cost 2 vuzpr LHS, <3,u,1,2> + 671924893U, // <1,3,u,3>: Cost 1 vuzpr LHS, LHS + 1746109824U, // <1,3,u,4>: Cost 2 vuzpr LHS, <1,u,3,4> + 2014144206U, // <1,3,u,5>: Cost 2 vtrnr LHS, <2,3,4,5> + 1745668934U, // <1,3,u,6>: Cost 2 vuzpr LHS, <3,u,5,6> + 671927849U, // <1,3,u,7>: Cost 1 vuzpr LHS, RHS + 671924898U, // <1,3,u,u>: Cost 1 vuzpr LHS, LHS + 3165396993U, // <1,4,0,0>: Cost 3 ins <1,u,0,0>, lane 1 + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2758434918U, // <1,4,0,2>: Cost 3 vuzpl <1,1,4,5>, LHS + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 3165429761U, // <1,4,0,4>: Cost 3 ins <1,u,0,4>, lane 1 + 1818152246U, // <1,4,0,5>: Cost 2 vzipl <1,0,3,2>, RHS + 3026537782U, // <1,4,0,6>: Cost 3 vtrnl <1,1,0,0>, RHS + 3162808323U, // <1,4,0,7>: Cost 3 ins <1,4,0,u>, lane 3 + 1818152489U, // <1,4,0,u>: Cost 2 vzipl 
<1,0,3,2>, RHS + 3204620288U, // <1,4,1,0>: Cost 3 ins <u,4,1,0>, lane 0 + 2091737089U, // <1,4,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 3204636672U, // <1,4,1,2>: Cost 3 ins <u,4,1,2>, lane 0 + 2091753473U, // <1,4,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 1745674343U, // <1,4,1,4>: Cost 2 vuzpr <0,1,2,4>, <0,1,2,4> + 1818660150U, // <1,4,1,5>: Cost 2 vzipl <1,1,1,1>, RHS + 1952877878U, // <1,4,1,6>: Cost 2 vtrnl <1,1,1,1>, RHS + 3165528065U, // <1,4,1,7>: Cost 3 ins <1,u,1,7>, lane 1 + 1818660393U, // <1,4,1,u>: Cost 2 vzipl <1,1,1,1>, RHS + 2893237103U, // <1,4,2,0>: Cost 3 vzipl <1,2,3,4>, <4,0,1,2> + 2893237194U, // <1,4,2,1>: Cost 3 vzipl <1,2,3,4>, <4,1,2,3> + 3165560833U, // <1,4,2,2>: Cost 3 ins <1,u,2,2>, lane 1 + 2130976768U, // <1,4,2,3>: Cost 2 ins <u,4,2,3>, lane 0 + 2893237467U, // <1,4,2,4>: Cost 3 vzipl <1,2,3,4>, <4,4,5,6> + 1819462966U, // <1,4,2,5>: Cost 2 vzipl <1,2,3,0>, RHS + 2131001344U, // <1,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0 + 3165601793U, // <1,4,2,7>: Cost 3 ins <1,u,2,7>, lane 1 + 1819463209U, // <1,4,2,u>: Cost 2 vzipl <1,2,3,0>, RHS + 2091876353U, // <1,4,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 3027454831U, // <1,4,3,1>: Cost 3 vtrnl <1,2,3,4>, <4,0,1,2> + 2091892737U, // <1,4,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // <1,4,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2061880528U, // <1,4,3,4>: Cost 2 vtrnr LHS, <4,4,4,4> + 2014101842U, // <1,4,3,5>: Cost 2 vtrnr LHS, <0,4,1,5> + 2014101852U, // <1,4,3,6>: Cost 2 vtrnr LHS, <0,4,2,6> + 2091933697U, // <1,4,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 2014101845U, // <1,4,3,u>: Cost 2 vtrnr LHS, <0,4,1,u> + 2557100134U, // <1,4,4,0>: Cost 3 vext1 <1,1,4,4>, LHS + 2557100882U, // <1,4,4,1>: Cost 3 vext1 <1,1,4,4>, <1,1,4,4> + 3165708289U, // <1,4,4,2>: Cost 3 ins <1,u,4,2>, lane 1 + 2819416409U, // <1,4,4,3>: Cost 3 vuzpr <0,1,2,4>, <0,4,2,3> + 2131132416U, // <1,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0 + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2758438198U, // <1,4,4,6>: Cost 3 vuzpl 
<1,1,4,5>, RHS + 2819419365U, // <1,4,4,7>: Cost 3 vuzpr <0,1,2,4>, <4,4,6,7> + 2131132416U, // <1,4,4,u>: Cost 2 ins <u,4,4,4>, lane 0 + 1477394554U, // <1,4,5,0>: Cost 2 vext1 <0,1,4,5>, <0,1,4,5> + 2955558949U, // <1,4,5,1>: Cost 3 vzipr <0,4,1,5>, <0,0,4,1> + 3204931584U, // <1,4,5,2>: Cost 3 ins <u,4,5,2>, lane 0 + 3165790209U, // <1,4,5,3>: Cost 3 ins <1,u,5,3>, lane 1 + 1477397814U, // <1,4,5,4>: Cost 2 vext1 <0,1,4,5>, RHS + 1821510966U, // <1,4,5,5>: Cost 2 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2092081153U, // <1,4,5,7>: Cost 2 ins <1,u,5,7>, lane 1 + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117268U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,4,6> + 3165855745U, // <1,4,6,2>: Cost 3 ins <1,u,6,2>, lane 1 + 2569062662U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,1,4,6> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 2895768886U, // <1,4,6,5>: Cost 3 vzipl <1,6,1,7>, RHS + 2131296256U, // <1,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0 + 2131304448U, // <1,4,6,7>: Cost 2 ins <u,4,6,7>, lane 0 + 2131296256U, // <1,4,6,u>: Cost 2 ins <u,4,6,6>, lane 0 + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3165921281U, // <1,4,7,1>: Cost 3 ins <1,u,7,1>, lane 1 + 3205079040U, // <1,4,7,2>: Cost 3 ins <u,4,7,2>, lane 0 + 2861297792U, // <1,4,7,3>: Cost 3 vuzpr <7,1,3,4>, <5,7,1,3> + 2669778278U, // <1,4,7,4>: Cost 3 vext2 <u,7,1,4>, <7,4,5,6> + 3205103616U, // <1,4,7,5>: Cost 3 ins <u,4,7,5>, lane 0 + 2131369984U, // <1,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 3165970433U, // <1,4,7,7>: Cost 3 ins <1,u,7,7>, lane 1 + 2131369984U, // <1,4,7,u>: Cost 2 ins <u,4,7,6>, lane 0 + 2091876353U, // <1,4,u,0>: Cost 2 ins <1,u,3,0>, lane 1 + 2091737089U, // <1,4,u,1>: Cost 2 ins <1,u,1,1>, lane 1 + 2091892737U, // <1,4,u,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091753473U, // <1,4,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2061921488U, // <1,4,u,4>: Cost 2 
vtrnr LHS, <4,4,4,4> + 2014142802U, // <1,4,u,5>: Cost 2 vtrnr LHS, <0,4,1,5> + 2014142812U, // <1,4,u,6>: Cost 2 vtrnr LHS, <0,4,2,6> + 2091933697U, // <1,4,u,7>: Cost 2 ins <1,u,3,7>, lane 1 + 2014142805U, // <1,4,u,u>: Cost 2 vtrnr LHS, <0,4,1,u> + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 1686110310U, // <1,5,0,2>: Cost 2 vuzpl <1,3,5,7>, LHS + 3163471875U, // <1,5,0,3>: Cost 3 ins <1,5,0,u>, lane 3 + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 3165437953U, // <1,5,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 3164045314U, // <1,5,0,6>: Cost 3 ins <1,5,u,6>, lane 2 + 2090311682U, // <1,5,0,7>: Cost 2 ins <1,5,u,7>, lane 2 + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2091737089U, // <1,5,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2091753473U, // <1,5,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 1686111232U, // <1,5,1,5>: Cost 2 vuzpl <1,3,5,7>, <1,3,5,7> + 2958181456U, // <1,5,1,6>: Cost 3 vzipr <0,u,1,1>, <1,4,5,6> + 2019986742U, // <1,5,1,7>: Cost 2 vtrnr <1,1,1,1>, RHS + 2019986743U, // <1,5,1,u>: Cost 2 vtrnr <1,1,1,1>, RHS + 2759853734U, // <1,5,2,0>: Cost 3 vuzpl <1,3,5,7>, <2,3,0,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2090319877U, // <1,5,2,3>: Cost 2 ins <1,5,u,u>, lane 5 + 2759853774U, // <1,5,2,4>: Cost 3 vuzpl <1,3,5,7>, <2,3,4,5> + 2994687194U, // <1,5,2,5>: Cost 3 vzipr <7,0,1,2>, <4,4,5,5> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 2090311682U, // <1,5,2,7>: Cost 2 ins <1,5,u,7>, lane 2 + 2090319877U, // <1,5,2,u>: Cost 2 ins <1,5,u,u>, lane 5 + 2091876353U, // <1,5,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 2089951235U, // <1,5,3,1>: Cost 2 ins <1,5,3,u>, lane 3 + 
2091892737U, // <1,5,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // <1,5,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2091909121U, // <1,5,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2061881348U, // <1,5,3,5>: Cost 2 vtrnr LHS, <5,5,5,5> + 2089951235U, // <1,5,3,6>: Cost 2 ins <1,5,3,u>, lane 3 + 940363062U, // <1,5,3,7>: Cost 1 vtrnr LHS, RHS + 940363063U, // <1,5,3,u>: Cost 1 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3164012546U, // <1,5,4,2>: Cost 3 ins <1,5,u,2>, lane 2 + 3163766787U, // <1,5,4,3>: Cost 3 ins <1,5,4,u>, lane 3 + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 1686113590U, // <1,5,4,6>: Cost 2 vuzpl <1,3,5,7>, RHS + 2090311682U, // <1,5,4,7>: Cost 2 ins <1,5,u,7>, lane 2 + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2955561954U, // <1,5,5,0>: Cost 3 vzipr <0,4,1,5>, <4,1,5,0> + 2955561874U, // <1,5,5,1>: Cost 3 vzipr <0,4,1,5>, <4,0,5,1> + 3165782017U, // <1,5,5,2>: Cost 3 ins <1,u,5,2>, lane 1 + 2955559851U, // <1,5,5,3>: Cost 3 vzipr <0,4,1,5>, <1,2,5,3> + 2955561958U, // <1,5,5,4>: Cost 3 vzipr <0,4,1,5>, <4,1,5,4> + 2131877888U, // <1,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0 + 2955561474U, // <1,5,5,6>: Cost 3 vzipr <0,4,1,5>, <3,4,5,6> + 2092081153U, // <1,5,5,7>: Cost 2 ins <1,u,5,7>, lane 1 + 2092081153U, // <1,5,5,u>: Cost 2 ins <1,u,5,7>, lane 1 + 2131910656U, // <1,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0 + 2131918848U, // <1,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 2131927040U, // <1,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2131935232U, // <1,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0 + 2131943424U, // <1,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0 + 2131951616U, // <1,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 2131959808U, // <1,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0 + 1058226176U, // <1,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <1,5,6,u>: Cost 1 ins RHS, lane 0 + 2557198438U, // 
<1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199198U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,5,7> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2759857248U, // <1,5,7,3>: Cost 3 vuzpl <1,3,5,7>, <7,1,3,5> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2759857510U, // <1,5,7,5>: Cost 3 vuzpl <1,3,5,7>, <7,4,5,6> + 2593035086U, // <1,5,7,6>: Cost 3 vext1 <7,1,5,7>, <6,7,0,1> + 2132041728U, // <1,5,7,7>: Cost 2 ins <u,5,7,7>, lane 0 + 2132041728U, // <1,5,7,u>: Cost 2 ins <u,5,7,7>, lane 0 + 2091876353U, // <1,5,u,0>: Cost 2 ins <1,u,3,0>, lane 1 + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 1686116142U, // <1,5,u,2>: Cost 2 vuzpl <1,3,5,7>, LHS + 2091753473U, // <1,5,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 1686116506U, // <1,5,u,6>: Cost 2 vuzpl <1,3,5,7>, RHS + 940404022U, // <1,5,u,7>: Cost 1 vtrnr LHS, RHS + 940404023U, // <1,5,u,u>: Cost 1 vtrnr LHS, RHS + 3205873664U, // <1,6,0,0>: Cost 3 ins <u,6,0,0>, lane 0 + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2132148224U, // <1,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0 + 3087819259U, // <1,6,0,3>: Cost 3 vtrnr <0,1,2,0>, <0,6,2,3> + 2620023123U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,6> + 3165437953U, // <1,6,0,5>: Cost 3 ins <1,u,0,5>, lane 1 + 3164708866U, // <1,6,0,6>: Cost 3 ins <1,6,u,6>, lane 2 + 2954857782U, // <1,6,0,7>: Cost 3 vzipr <0,3,1,0>, RHS + 2132148224U, // <1,6,0,u>: Cost 2 ins <u,6,0,2>, lane 0 + 3205947392U, // <1,6,1,0>: Cost 3 ins <u,6,1,0>, lane 0 + 2091737089U, // <1,6,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 3005959068U, // <1,6,1,2>: Cost 3 vzipr <u,u,1,1>, <4,0,6,2> + 2091753473U, // <1,6,1,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3205988352U, // <1,6,1,5>: Cost 3 ins <u,6,1,5>, lane 0 + 1745690729U, // <1,6,1,6>: Cost 2 vuzpr <0,1,2,6>, <0,1,2,6> + 1884441910U, 
// <1,6,1,7>: Cost 2 vzipr <0,u,1,1>, RHS + 1884441911U, // <1,6,1,u>: Cost 2 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 2994687442U, // <1,6,2,1>: Cost 3 vzipr <7,0,1,2>, <4,7,6,1> + 2994686876U, // <1,6,2,2>: Cost 3 vzipr <7,0,1,2>, <4,0,6,2> + 2132303872U, // <1,6,2,3>: Cost 2 ins <u,6,2,3>, lane 0 + 3206053888U, // <1,6,2,4>: Cost 3 ins <u,6,2,4>, lane 0 + 3165585409U, // <1,6,2,5>: Cost 3 ins <1,u,2,5>, lane 1 + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 1897057590U, // <1,6,2,7>: Cost 2 vzipr <3,0,1,2>, RHS + 1897057591U, // <1,6,2,u>: Cost 2 vzipr <3,0,1,2>, RHS + 2061881442U, // <1,6,3,0>: Cost 2 vtrnr LHS, <5,6,7,0> + 2987396400U, // <1,6,3,1>: Cost 3 vzipr <5,7,1,3>, <4,5,6,1> + 2061880652U, // <1,6,3,2>: Cost 2 vtrnr LHS, <4,6,0,2> + 2091900929U, // <1,6,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2061881446U, // <1,6,3,4>: Cost 2 vtrnr LHS, <5,6,7,4> + 3118078194U, // <1,6,3,5>: Cost 3 vtrnr <5,1,7,3>, <u,6,7,5> + 2061880692U, // <1,6,3,6>: Cost 2 vtrnr LHS, <4,6,4,6> + 2014103482U, // <1,6,3,7>: Cost 2 vtrnr LHS, <2,6,3,7> + 2014103483U, // <1,6,3,u>: Cost 2 vtrnr LHS, <2,6,3,u> + 3206168576U, // <1,6,4,0>: Cost 3 ins <u,6,4,0>, lane 0 + 2761256201U, // <1,6,4,1>: Cost 3 vuzpl <1,5,6,7>, <4,5,1,7> + 3164676098U, // <1,6,4,2>: Cost 3 ins <1,6,u,2>, lane 2 + 3087852027U, // <1,6,4,3>: Cost 3 vtrnr <0,1,2,4>, <0,6,2,3> + 3206201344U, // <1,6,4,4>: Cost 3 ins <u,6,4,4>, lane 0 + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2132475904U, // <1,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0 + 2954890550U, // <1,6,4,7>: Cost 3 vzipr <0,3,1,4>, RHS + 2132475904U, // <1,6,4,u>: Cost 2 ins <u,6,4,6>, lane 0 + 3164659714U, // <1,6,5,0>: Cost 3 ins <1,6,u,0>, lane 2 + 3206250496U, // <1,6,5,1>: Cost 3 ins <u,6,5,1>, lane 0 + 3003337628U, // <1,6,5,2>: Cost 3 vzipr <u,4,1,5>, <4,0,6,2> + 3165790209U, // <1,6,5,3>: Cost 3 ins <1,u,5,3>, lane 1 + 3206275072U, // <1,6,5,4>: Cost 3 ins <u,6,5,4>, lane 0 + 
3206283264U, // <1,6,5,5>: Cost 3 ins <u,6,5,5>, lane 0 + 3003337956U, // <1,6,5,6>: Cost 3 vzipr <u,4,1,5>, <4,4,6,6> + 1881820470U, // <1,6,5,7>: Cost 2 vzipr <0,4,1,5>, RHS + 1881820471U, // <1,6,5,u>: Cost 2 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 2557264742U, // <1,6,6,1>: Cost 3 vext1 <1,1,6,6>, <1,1,6,6> + 3165855745U, // <1,6,6,2>: Cost 3 ins <1,u,6,2>, lane 1 + 2819432955U, // <1,6,6,3>: Cost 3 vuzpr <0,1,2,6>, <0,6,2,3> + 3206348800U, // <1,6,6,4>: Cost 3 ins <u,6,6,4>, lane 0 + 3206356992U, // <1,6,6,5>: Cost 3 ins <u,6,6,5>, lane 0 + 2132623360U, // <1,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0 + 2132631552U, // <1,6,6,7>: Cost 2 ins <u,6,6,7>, lane 0 + 2132623360U, // <1,6,6,u>: Cost 2 ins <u,6,6,6>, lane 0 + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 3206406144U, // <1,6,7,2>: Cost 3 ins <u,6,7,2>, lane 0 + 3206414336U, // <1,6,7,3>: Cost 3 ins <u,6,7,3>, lane 0 + 2132680704U, // <1,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0 + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2725507979U, // <1,6,7,6>: Cost 3 vext3 <6,u,0,1>, <6,7,6,u> + 2132705280U, // <1,6,7,7>: Cost 2 ins <u,6,7,7>, lane 0 + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2091737089U, // <1,6,u,1>: Cost 2 ins <1,u,1,1>, lane 1 + 2061921612U, // <1,6,u,2>: Cost 2 vtrnr LHS, <4,6,0,2> + 2091753473U, // <1,6,u,3>: Cost 2 ins <1,u,1,3>, lane 1 + 2061922406U, // <1,6,u,4>: Cost 2 vtrnr LHS, <5,6,7,4> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2061921652U, // <1,6,u,6>: Cost 2 vtrnr LHS, <4,6,4,6> + 2014144442U, // <1,6,u,7>: Cost 2 vtrnr LHS, <2,6,3,7> + 2014144443U, // <1,6,u,u>: Cost 2 vtrnr LHS, <2,6,3,u> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2132803584U, // <1,7,0,1>: Cost 2 ins <u,7,0,1>, lane 0 + 3206553600U, // <1,7,0,2>: Cost 3 ins 
<u,7,0,2>, lane 0 + 2257286235U, // <1,7,0,3>: Cost 3 vrev <7,1,3,0> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3206578176U, // <1,7,0,5>: Cost 3 ins <u,7,0,5>, lane 0 + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 3165380610U, // <1,7,0,7>: Cost 3 ins <1,7,u,7>, lane 2 + 2132803584U, // <1,7,0,u>: Cost 2 ins <u,7,0,1>, lane 0 + 2581184614U, // <1,7,1,0>: Cost 3 vext1 <5,1,7,1>, LHS + 2091737089U, // <1,7,1,1>: Cost 2 ins <1,u,1,1>, lane 1 + 3206627328U, // <1,7,1,2>: Cost 3 ins <u,7,1,2>, lane 0 + 2132893696U, // <1,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0 + 2581187894U, // <1,7,1,4>: Cost 3 vext1 <5,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 1745698922U, // <1,7,1,7>: Cost 2 vuzpr <0,1,2,7>, <0,1,2,7> + 2132893696U, // <1,7,1,u>: Cost 2 ins <u,7,1,3>, lane 0 + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 2994687370U, // <1,7,2,1>: Cost 3 vzipr <7,0,1,2>, <4,6,7,1> + 3206701056U, // <1,7,2,2>: Cost 3 ins <u,7,2,2>, lane 0 + 2132967424U, // <1,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0 + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3206725632U, // <1,7,2,5>: Cost 3 ins <u,7,2,5>, lane 0 + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 2994688024U, // <1,7,2,7>: Cost 3 vzipr <7,0,1,2>, <5,5,7,7> + 2132967424U, // <1,7,2,u>: Cost 2 ins <u,7,2,3>, lane 0 + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2061882190U, // <1,7,3,1>: Cost 2 vtrnr LHS, <6,7,0,1> + 2091892737U, // <1,7,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2061881472U, // <1,7,3,3>: Cost 2 vtrnr LHS, <5,7,1,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2061881512U, // <1,7,3,7>: Cost 2 vtrnr LHS, <5,7,5,7> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 
3 vext3 <7,4,0,1>, <7,4,0,1> + 3165331458U, // <1,7,4,1>: Cost 3 ins <1,7,u,1>, lane 2 + 2644585539U, // <1,7,4,2>: Cost 3 vext2 <4,5,1,7>, <4,2,6,7> + 2257319007U, // <1,7,4,3>: Cost 3 vrev <7,1,3,4> + 3206864896U, // <1,7,4,4>: Cost 3 ins <u,7,4,4>, lane 0 + 2133131264U, // <1,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0 + 3206881280U, // <1,7,4,6>: Cost 3 ins <u,7,4,6>, lane 0 + 3165380610U, // <1,7,4,7>: Cost 3 ins <1,7,u,7>, lane 2 + 2133131264U, // <1,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0 + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 3028292602U, // <1,7,5,1>: Cost 3 vtrnl <1,3,5,7>, <7,0,1,2> + 3165782017U, // <1,7,5,2>: Cost 3 ins <1,u,5,2>, lane 1 + 3028292704U, // <1,7,5,3>: Cost 3 vtrnl <1,3,5,7>, <7,1,3,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 3028292966U, // <1,7,5,5>: Cost 3 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 2133221376U, // <1,7,5,7>: Cost 2 ins <u,7,5,7>, lane 0 + 2133221376U, // <1,7,5,u>: Cost 2 ins <u,7,5,7>, lane 0 + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3206995968U, // <1,7,6,2>: Cost 3 ins <u,7,6,2>, lane 0 + 3165347842U, // <1,7,6,3>: Cost 3 ins <1,7,u,3>, lane 2 + 2257409130U, // <1,7,6,4>: Cost 3 vrev <7,1,4,6> + 3207020544U, // <1,7,6,5>: Cost 3 ins <u,7,6,5>, lane 0 + 3207028736U, // <1,7,6,6>: Cost 3 ins <u,7,6,6>, lane 0 + 2133295104U, // <1,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0 + 2133295104U, // <1,7,6,u>: Cost 2 ins <u,7,6,7>, lane 0 + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 2861470542U, // <1,7,7,1>: Cost 3 vuzpr <7,1,5,7>, <6,7,0,1> + 3165929473U, // <1,7,7,2>: Cost 3 ins <1,u,7,2>, lane 1 + 2998046416U, // <1,7,7,3>: Cost 3 vzipr <7,5,1,7>, <5,1,7,3> + 3207086080U, // <1,7,7,4>: Cost 3 ins <u,7,7,4>, lane 0 + 2257491060U, // <1,7,7,5>: Cost 3 vrev <7,1,5,7> + 3207102464U, // <1,7,7,6>: Cost 3 ins <u,7,7,6>, lane 0 + 2133368832U, // <1,7,7,7>: 
Cost 2 ins <u,7,7,7>, lane 0 + 2133368832U, // <1,7,7,u>: Cost 2 ins <u,7,7,7>, lane 0 + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2061923150U, // <1,7,u,1>: Cost 2 vtrnr LHS, <6,7,0,1> + 2091892737U, // <1,7,u,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2061922432U, // <1,7,u,3>: Cost 2 vtrnr LHS, <5,7,1,3> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2061922472U, // <1,7,u,7>: Cost 2 vtrnr LHS, <5,7,5,7> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1745707008U, // <1,u,0,0>: Cost 2 vuzpr LHS, <0,0,0,0> + 1745707018U, // <1,u,0,1>: Cost 2 vuzpr LHS, <0,0,1,1> + 1745707028U, // <1,u,0,2>: Cost 2 vuzpr LHS, <0,0,2,2> + 2087624706U, // <1,u,0,3>: Cost 2 ins <1,1,u,3>, lane 2 + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1818155162U, // <1,u,0,5>: Cost 2 vzipl <1,0,3,2>, RHS + 2891897040U, // <1,u,0,6>: Cost 3 vzipl <1,0,3,2>, <u,6,3,7> + 2088984578U, // <1,u,0,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745707025U, // <1,u,0,u>: Cost 2 vuzpr LHS, <0,0,1,u> + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 671965286U, // <1,u,1,3>: Cost 1 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1818663066U, // <1,u,1,5>: Cost 2 vzipl <1,1,1,1>, RHS + 1952880794U, // <1,u,1,6>: Cost 2 vtrnl <1,1,1,1>, RHS + 1884441928U, // <1,u,1,7>: Cost 2 vzipr <0,u,1,1>, RHS + 671965291U, // <1,u,1,u>: Cost 1 vuzpr LHS, LHS + 1745707926U, // <1,u,2,0>: Cost 2 vuzpr LHS, <1,2,3,0> + 1819465518U, // <1,u,2,1>: Cost 2 vzipl <1,2,3,0>, LHS + 1745707172U, // <1,u,2,2>: Cost 2 vuzpr LHS, <0,2,0,2> + 1055244288U, // <1,u,2,3>: Cost 1 ins LHS, lane 0 + 1745707930U, // <1,u,2,4>: Cost 2 vuzpr LHS, <1,2,3,4> + 1819465882U, // <1,u,2,5>: Cost 2 vzipl <1,2,3,0>, RHS + 1745707212U, // <1,u,2,6>: Cost 2 
vuzpr LHS, <0,2,4,6> + 1897057608U, // <1,u,2,7>: Cost 2 vzipr <3,0,1,2>, RHS + 1055244288U, // <1,u,2,u>: Cost 1 ins LHS, lane 0 + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 2014102162U, // <1,u,3,1>: Cost 2 vtrnr LHS, <0,u,1,1> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 940360349U, // <1,u,3,3>: Cost 1 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 2014102166U, // <1,u,3,5>: Cost 2 vtrnr LHS, <0,u,1,5> + 2014102176U, // <1,u,3,6>: Cost 2 vtrnr LHS, <0,u,2,6> + 940363305U, // <1,u,3,7>: Cost 1 vtrnr LHS, RHS + 940360354U, // <1,u,3,u>: Cost 1 vtrnr LHS, LHS + 2088263682U, // <1,u,4,0>: Cost 2 ins <1,2,u,0>, lane 2 + 2087608322U, // <1,u,4,1>: Cost 2 ins <1,1,u,1>, lane 2 + 2086952962U, // <1,u,4,2>: Cost 2 ins <1,0,u,2>, lane 2 + 2087624706U, // <1,u,4,3>: Cost 2 ins <1,1,u,3>, lane 2 + 1793486032U, // <1,u,4,4>: Cost 2 vuzpr LHS, <4,4,4,4> + 1745707346U, // <1,u,4,5>: Cost 2 vuzpr LHS, <0,4,1,5> + 1745707356U, // <1,u,4,6>: Cost 2 vuzpr LHS, <0,4,2,6> + 2088984578U, // <1,u,4,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1745707349U, // <1,u,4,u>: Cost 2 vuzpr LHS, <0,4,1,u> + 2088263682U, // <1,u,5,0>: Cost 2 ins <1,2,u,0>, lane 2 + 1821513518U, // <1,u,5,1>: Cost 2 vzipl <1,5,3,7>, LHS + 1954551598U, // <1,u,5,2>: Cost 2 vtrnl <1,3,5,7>, LHS + 1881817244U, // <1,u,5,3>: Cost 2 vzipr <0,4,1,5>, LHS + 2088296450U, // <1,u,5,4>: Cost 2 ins <1,2,u,4>, lane 2 + 1821513882U, // <1,u,5,5>: Cost 2 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 671968566U, // <1,u,5,7>: Cost 1 vuzpr LHS, RHS + 671968567U, // <1,u,5,u>: Cost 1 vuzpr LHS, RHS + 1793486946U, // <1,u,6,0>: Cost 2 vuzpr LHS, <5,6,7,0> + 2087608322U, // <1,u,6,1>: Cost 2 ins <1,1,u,1>, lane 2 + 1793486156U, // <1,u,6,2>: Cost 2 vuzpr LHS, <4,6,0,2> + 2087624706U, // <1,u,6,3>: Cost 2 ins <1,1,u,3>, lane 2 + 1793486950U, // <1,u,6,4>: Cost 2 vuzpr LHS, <5,6,7,4> + 2131951616U, // <1,u,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 1793486196U, // <1,u,6,6>: Cost 2 
vuzpr LHS, <4,6,4,6> + 1058226176U, // <1,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <1,u,6,u>: Cost 1 ins RHS, lane 0 + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 1793487694U, // <1,u,7,1>: Cost 2 vuzpr LHS, <6,7,0,1> + 2086952962U, // <1,u,7,2>: Cost 2 ins <1,0,u,2>, lane 2 + 1793486976U, // <1,u,7,3>: Cost 2 vuzpr LHS, <5,7,1,3> + 2088296450U, // <1,u,7,4>: Cost 2 ins <1,2,u,4>, lane 2 + 1793487734U, // <1,u,7,5>: Cost 2 vuzpr LHS, <6,7,4,5> + 2131369984U, // <1,u,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 1793487016U, // <1,u,7,7>: Cost 2 vuzpr LHS, <5,7,5,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 671965853U, // <1,u,u,3>: Cost 1 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1745707670U, // <1,u,u,5>: Cost 2 vuzpr LHS, <0,u,1,5> + 1745707680U, // <1,u,u,6>: Cost 2 vuzpr LHS, <0,u,2,6> + 671968809U, // <1,u,u,7>: Cost 1 vuzpr LHS, RHS + 671965858U, // <1,u,u,u>: Cost 1 vuzpr LHS, LHS + 2128150528U, // <2,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0 + 2097635329U, // <2,0,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 1691664486U, // <2,0,0,2>: Cost 2 vuzpl <2,3,0,1>, LHS + 2826094014U, // <2,0,0,3>: Cost 3 vuzpr <1,2,3,0>, <2,0,1,3> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 2826094772U, // <2,0,0,5>: Cost 3 vuzpr <1,2,3,0>, <3,0,4,5> + 3171418113U, // <2,0,0,6>: Cost 3 ins <2,u,0,6>, lane 1 + 3094529510U, // <2,0,0,7>: Cost 3 vtrnr <1,2,3,0>, <2,0,5,7> + 1691664540U, // <2,0,0,u>: Cost 2 vuzpl <2,3,0,1>, LHS + 2215927971U, // <2,0,1,0>: Cost 3 vrev <0,2,0,1> + 2128232448U, // <2,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0 + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 1752350822U, // <2,0,1,3>: Cost 2 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 2765407232U, // <2,0,1,5>: Cost 3 vuzpl <2,3,0,1>, <1,3,5,7> + 2587308473U, 
// <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3166707714U, // <2,0,1,7>: Cost 3 ins <2,0,u,7>, lane 2 + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1142194340U, // <2,0,2,0>: Cost 2 vrev <0,2,0,2> + 1825374310U, // <2,0,2,1>: Cost 2 vzipl <2,2,2,2>, LHS + 1959592038U, // <2,0,2,2>: Cost 2 vtrnl <2,2,2,2>, LHS + 2128322560U, // <2,0,2,3>: Cost 2 ins <u,0,2,3>, lane 0 + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2599259856U, // <2,0,2,5>: Cost 3 vext1 <u,2,0,2>, <5,1,7,3> + 3088351274U, // <2,0,2,6>: Cost 3 vtrnr <0,2,0,2>, <0,0,4,6> + 2599261178U, // <2,0,2,7>: Cost 3 vext1 <u,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 1879883776U, // <2,0,3,0>: Cost 2 vzipr LHS, <0,0,0,0> + 1879885478U, // <2,0,3,1>: Cost 2 vzipr LHS, <2,3,0,1> + 1879883940U, // <2,0,3,2>: Cost 2 vzipr LHS, <0,2,0,2> + 2097872897U, // <2,0,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2958270630U, // <2,0,3,4>: Cost 3 vzipr LHS, <0,2,0,4> + 2826094286U, // <2,0,3,5>: Cost 3 vuzpr <1,2,3,0>, <2,3,4,5> + 2958270794U, // <2,0,3,6>: Cost 3 vzipr LHS, <0,4,0,6> + 2097905665U, // <2,0,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1879883946U, // <2,0,3,u>: Cost 2 vzipr LHS, <0,2,0,u> + 2215952550U, // <2,0,4,0>: Cost 3 vrev <0,2,0,4> + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 1960427622U, // <2,0,4,2>: Cost 2 vtrnl <2,3,4,5>, LHS + 3171688449U, // <2,0,4,3>: Cost 3 ins <2,u,4,3>, lane 1 + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2097963009U, // <2,0,4,5>: Cost 2 ins <2,u,4,5>, lane 1 + 1691667766U, // <2,0,4,6>: Cost 2 vuzpl <2,3,0,1>, RHS + 3171721217U, // <2,0,4,7>: Cost 3 ins <2,u,4,7>, lane 1 + 1691667784U, // <2,0,4,u>: Cost 2 vuzpl <2,3,0,1>, RHS + 3033596068U, // <2,0,5,0>: Cost 3 vtrnl <2,2,5,7>, <0,2,0,2> + 2128527360U, // <2,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0 + 2955632804U, // <2,0,5,2>: Cost 3 vzipr <0,4,2,5>, <0,2,0,2> + 2216181954U, // <2,0,5,3>: Cost 3 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 
vrev <0,2,4,5> + 2867900420U, // <2,0,5,5>: Cost 3 vuzpr <u,2,3,0>, <5,5,5,5> + 3202310144U, // <2,0,5,6>: Cost 3 ins <u,0,5,6>, lane 0 + 1752354102U, // <2,0,5,7>: Cost 2 vuzpr <1,2,3,0>, RHS + 1752354103U, // <2,0,5,u>: Cost 2 vuzpr <1,2,3,0>, RHS + 3088678912U, // <2,0,6,0>: Cost 3 vtrnr <0,2,4,6>, <0,0,0,0> + 1828143206U, // <2,0,6,1>: Cost 2 vzipl <2,6,3,7>, LHS + 2128609280U, // <2,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 3171835905U, // <2,0,6,3>: Cost 3 ins <2,u,6,3>, lane 1 + 1142522060U, // <2,0,6,4>: Cost 2 vrev <0,2,4,6> + 3171852289U, // <2,0,6,5>: Cost 3 ins <2,u,6,5>, lane 1 + 2867899764U, // <2,0,6,6>: Cost 3 vuzpr <u,2,3,0>, <4,6,4,6> + 2128650240U, // <2,0,6,7>: Cost 2 ins <u,0,6,7>, lane 0 + 1142817008U, // <2,0,6,u>: Cost 2 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 2867901262U, // <2,0,7,1>: Cost 3 vuzpr <u,2,3,0>, <6,7,0,1> + 2956976292U, // <2,0,7,2>: Cost 3 vzipr <0,6,2,7>, <0,2,0,2> + 2867900544U, // <2,0,7,3>: Cost 3 vuzpr <u,2,3,0>, <5,7,1,3> + 3171917825U, // <2,0,7,4>: Cost 3 ins <2,u,7,4>, lane 1 + 2867901302U, // <2,0,7,5>: Cost 3 vuzpr <u,2,3,0>, <6,7,4,5> + 3166699522U, // <2,0,7,6>: Cost 3 ins <2,0,u,6>, lane 2 + 2867900584U, // <2,0,7,7>: Cost 3 vuzpr <u,2,3,0>, <5,7,5,7> + 2867900549U, // <2,0,7,u>: Cost 3 vuzpr <u,2,3,0>, <5,7,1,u> + 1879924736U, // <2,0,u,0>: Cost 2 vzipr LHS, <0,0,0,0> + 1879926438U, // <2,0,u,1>: Cost 2 vzipr LHS, <2,3,0,1> + 1879924900U, // <2,0,u,2>: Cost 2 vzipr LHS, <0,2,0,2> + 1752351389U, // <2,0,u,3>: Cost 2 vuzpr <1,2,3,0>, LHS + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2097963009U, // <2,0,u,5>: Cost 2 ins <2,u,4,5>, lane 1 + 1691670682U, // <2,0,u,6>: Cost 2 vuzpl <2,3,0,1>, RHS + 1752354345U, // <2,0,u,7>: Cost 2 vuzpr <1,2,3,0>, RHS + 1879924906U, // <2,0,u,u>: Cost 2 vzipr LHS, <0,2,0,u> + 2763497636U, // <2,1,0,0>: Cost 3 vuzpl <2,0,1,2>, <0,2,0,2> + 2097635329U, // <2,1,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 2820130966U, // <2,1,0,2>: Cost 3 
vuzpr <0,2,3,1>, <3,0,1,2> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2767487180U, // <2,1,0,4>: Cost 3 vuzpl <2,6,1,3>, <0,2,4,6> + 3033842688U, // <2,1,0,5>: Cost 3 vtrnl <2,3,0,1>, <1,3,5,7> + 3171418113U, // <2,1,0,6>: Cost 3 ins <2,u,0,6>, lane 1 + 3171426305U, // <2,1,0,7>: Cost 3 ins <2,u,0,7>, lane 1 + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551546028U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, <0,2,1,1> + 2128896000U, // <2,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0 + 2954938518U, // <2,1,1,2>: Cost 3 vzipr <0,3,2,1>, <3,0,1,2> + 2128912384U, // <2,1,1,3>: Cost 2 ins <u,1,1,3>, lane 0 + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3202670592U, // <2,1,1,5>: Cost 3 ins <u,1,1,5>, lane 0 + 3202678784U, // <2,1,1,6>: Cost 3 ins <u,1,1,6>, lane 0 + 2953612553U, // <2,1,1,7>: Cost 3 vzipr <0,1,2,1>, <4,5,1,7> + 2128896000U, // <2,1,1,u>: Cost 2 ins <u,1,1,1>, lane 0 + 2128961536U, // <2,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0 + 2128969728U, // <2,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 2128977920U, // <2,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0 + 1055244288U, // <2,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <2,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0 + 2129002496U, // <2,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2129010688U, // <2,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2129018880U, // <2,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0 + 1055244288U, // <2,1,2,u>: Cost 1 ins LHS, lane 0 + 2953625609U, // <2,1,3,0>: Cost 3 vzipr LHS, <0,0,1,0> + 1879883786U, // <2,1,3,1>: Cost 2 vzipr LHS, <0,0,1,1> + 1879885974U, // <2,1,3,2>: Cost 2 vzipr LHS, <3,0,1,2> + 1879884760U, // <2,1,3,3>: Cost 2 vzipr LHS, <1,3,1,3> + 2953625856U, // <2,1,3,4>: Cost 3 vzipr LHS, <0,3,1,4> + 1879884114U, // <2,1,3,5>: Cost 2 vzipr LHS, <0,4,1,5> + 2958270641U, // <2,1,3,6>: Cost 3 vzipr LHS, <0,2,1,6> + 2097905665U, // <2,1,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1879883793U, // <2,1,3,u>: Cost 2 vzipr LHS, <0,0,1,u> + 3171663873U, // <2,1,4,0>: Cost 3 ins 
<2,u,4,0>, lane 1 + 3094561588U, // <2,1,4,1>: Cost 3 vtrnr <1,2,3,4>, <1,1,1,1> + 2900378522U, // <2,1,4,2>: Cost 3 vzipl <2,4,1,3>, <1,2,3,4> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3171696641U, // <2,1,4,4>: Cost 3 ins <2,u,4,4>, lane 1 + 2097963009U, // <2,1,4,5>: Cost 2 ins <2,u,4,5>, lane 1 + 2763500854U, // <2,1,4,6>: Cost 3 vuzpl <2,0,1,2>, RHS + 3171721217U, // <2,1,4,7>: Cost 3 ins <2,u,4,7>, lane 1 + 2020819051U, // <2,1,4,u>: Cost 2 vtrnr <1,2,3,4>, LHS + 2551578800U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, <0,2,1,5> + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 2901001110U, // <2,1,5,2>: Cost 3 vzipl <2,5,0,7>, <1,2,3,0> + 2129207296U, // <2,1,5,3>: Cost 2 ins <u,1,5,3>, lane 0 + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3202965504U, // <2,1,5,5>: Cost 3 ins <u,1,5,5>, lane 0 + 3171786753U, // <2,1,5,6>: Cost 3 ins <2,u,5,6>, lane 1 + 2819910966U, // <2,1,5,7>: Cost 3 vuzpr <0,2,0,1>, RHS + 2129207296U, // <2,1,5,u>: Cost 2 ins <u,1,5,3>, lane 0 + 2551586993U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, <0,2,1,6> + 3088679732U, // <2,1,6,1>: Cost 3 vtrnr <0,2,4,6>, <1,1,1,1> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 2014937190U, // <2,1,6,3>: Cost 2 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS + 2955641170U, // <2,1,6,5>: Cost 3 vzipr <0,4,2,6>, <0,4,1,5> + 2901886177U, // <2,1,6,6>: Cost 3 vzipl <2,6,3,7>, <1,6,3,7> + 2129313792U, // <2,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0 + 2014937195U, // <2,1,6,u>: Cost 2 vtrnr <0,2,4,6>, LHS + 3171885057U, // <2,1,7,0>: Cost 3 ins <2,u,7,0>, lane 1 + 3203080192U, // <2,1,7,1>: Cost 3 ins <u,1,7,1>, lane 0 + 3001439874U, // <2,1,7,2>: Cost 3 vzipr <u,1,2,7>, <7,u,1,2> + 2129354752U, // <2,1,7,3>: Cost 2 ins <u,1,7,3>, lane 0 + 3171917825U, // <2,1,7,4>: Cost 3 ins <2,u,7,4>, lane 1 + 3203112960U, // <2,1,7,5>: Cost 3 ins <u,1,7,5>, lane 0 + 2222392248U, // <2,1,7,6>: Cost 3 vrev <1,2,6,7> + 3171942401U, // <2,1,7,7>: Cost 
3 ins <2,u,7,7>, lane 1 + 2129354752U, // <2,1,7,u>: Cost 2 ins <u,1,7,3>, lane 0 + 2128961536U, // <2,1,u,0>: Cost 2 ins <u,1,2,0>, lane 0 + 1879924746U, // <2,1,u,1>: Cost 2 vzipr LHS, <0,0,1,1> + 1879926934U, // <2,1,u,2>: Cost 2 vzipr LHS, <3,0,1,2> + 1055244288U, // <2,1,u,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <2,1,u,4>: Cost 2 ins <u,1,2,4>, lane 0 + 1879925074U, // <2,1,u,5>: Cost 2 vzipr LHS, <0,4,1,5> + 2129010688U, // <2,1,u,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2097905665U, // <2,1,u,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1055244288U, // <2,1,u,u>: Cost 1 ins LHS, lane 0 + 2020787094U, // <2,2,0,0>: Cost 2 vtrnr <1,2,3,0>, <1,2,3,0> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 1691156582U, // <2,2,0,2>: Cost 2 vuzpl <2,2,2,2>, LHS + 2094260226U, // <2,2,0,3>: Cost 2 ins <2,2,u,3>, lane 2 + 2819917256U, // <2,2,0,4>: Cost 3 vuzpr <0,2,0,2>, <2,0,2,4> + 3168018434U, // <2,2,0,5>: Cost 3 ins <2,2,u,5>, lane 2 + 2819915818U, // <2,2,0,6>: Cost 3 vuzpr <0,2,0,2>, <0,0,4,6> + 3171426305U, // <2,2,0,7>: Cost 3 ins <2,u,0,7>, lane 1 + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 1879867492U, // <2,2,1,1>: Cost 2 vzipr <0,1,2,1>, <0,1,2,1> + 2094252034U, // <2,2,1,2>: Cost 2 ins <2,2,u,2>, lane 2 + 1746174054U, // <2,2,1,3>: Cost 2 vuzpr <0,2,0,2>, LHS + 3167526915U, // <2,2,1,4>: Cost 3 ins <2,2,1,u>, lane 3 + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3203342336U, // <2,2,1,6>: Cost 3 ins <u,2,1,6>, lane 0 + 3168034818U, // <2,2,1,7>: Cost 3 ins <2,2,u,7>, lane 2 + 1746174059U, // <2,2,1,u>: Cost 2 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2093858819U, // <2,2,2,1>: Cost 2 ins <2,2,2,u>, lane 3 + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 1884520550U, // <2,2,2,3>: Cost 2 vzipr <0,u,2,2>, LHS + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2093858819U, // <2,2,2,5>: Cost 2 ins <2,2,2,u>, lane 3 + 
2093858819U, // <2,2,2,6>: Cost 2 ins <2,2,2,u>, lane 3 + 2093858819U, // <2,2,2,7>: Cost 2 ins <2,2,2,u>, lane 3 + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2129698816U, // <2,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0 + 2093932547U, // <2,2,3,1>: Cost 2 ins <2,2,3,u>, lane 3 + 1879885416U, // <2,2,3,2>: Cost 2 vzipr LHS, <2,2,2,2> + 806142054U, // <2,2,3,3>: Cost 1 vzipr LHS, LHS + 2129731584U, // <2,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0 + 2093932547U, // <2,2,3,5>: Cost 2 ins <2,2,3,u>, lane 3 + 1884528988U, // <2,2,3,6>: Cost 2 vzipr LHS, <0,4,2,6> + 2097905665U, // <2,2,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 806142059U, // <2,2,3,u>: Cost 1 vzipr LHS, LHS + 2551644344U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, <0,2,2,4> + 3171672065U, // <2,2,4,1>: Cost 3 ins <2,u,4,1>, lane 1 + 2094252034U, // <2,2,4,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2094260226U, // <2,2,4,3>: Cost 2 ins <2,2,u,3>, lane 2 + 2020819866U, // <2,2,4,4>: Cost 2 vtrnr <1,2,3,4>, <1,2,3,4> + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 1691159862U, // <2,2,4,6>: Cost 2 vuzpl <2,2,2,2>, RHS + 3171721217U, // <2,2,4,7>: Cost 3 ins <2,u,4,7>, lane 1 + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3167821827U, // <2,2,5,0>: Cost 3 ins <2,2,5,u>, lane 3 + 2670497488U, // <2,2,5,1>: Cost 3 vext2 <u,u,2,2>, <5,1,7,3> + 2094252034U, // <2,2,5,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2094260226U, // <2,2,5,3>: Cost 2 ins <2,2,u,3>, lane 2 + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 1879900264U, // <2,2,5,5>: Cost 2 vzipr <0,1,2,5>, <0,1,2,5> + 2670497890U, // <2,2,5,6>: Cost 3 vext2 <u,u,2,2>, <5,6,7,0> + 1746177334U, // <2,2,5,7>: Cost 2 vuzpr <0,2,0,2>, RHS + 1746177335U, // <2,2,5,u>: Cost 2 vuzpr <0,2,0,2>, RHS + 3088679830U, // <2,2,6,0>: Cost 3 vtrnr <0,2,4,6>, <1,2,3,0> + 3171819521U, // <2,2,6,1>: Cost 3 ins <2,u,6,1>, lane 1 + 2094252034U, // <2,2,6,2>: Cost 2 ins <2,2,u,2>, lane 2 + 1881899110U, // <2,2,6,3>: Cost 2 vzipr <0,4,2,6>, LHS + 3088679078U, // <2,2,6,4>: Cost 3 vtrnr 
<0,2,4,6>, <0,2,0,4> + 3171852289U, // <2,2,6,5>: Cost 3 ins <2,u,6,5>, lane 1 + 2014937292U, // <2,2,6,6>: Cost 2 vtrnr <0,2,4,6>, <0,2,4,6> + 2094301189U, // <2,2,6,7>: Cost 2 ins <2,2,u,u>, lane 5 + 1881899115U, // <2,2,6,u>: Cost 2 vzipr <0,4,2,6>, LHS + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 2867696462U, // <2,2,7,1>: Cost 3 vuzpr <u,2,0,2>, <6,7,0,1> + 2094252034U, // <2,2,7,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2130018304U, // <2,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0 + 2670499174U, // <2,2,7,4>: Cost 3 vext2 <u,u,2,2>, <7,4,5,6> + 2228291208U, // <2,2,7,5>: Cost 3 vrev <2,2,5,7> + 3203784704U, // <2,2,7,6>: Cost 3 ins <u,2,7,6>, lane 0 + 1879916650U, // <2,2,7,7>: Cost 2 vzipr <0,1,2,7>, <0,1,2,7> + 2130018304U, // <2,2,7,u>: Cost 2 ins <u,2,7,3>, lane 0 + 2020787094U, // <2,2,u,0>: Cost 2 vtrnr <1,2,3,0>, <1,2,3,0> + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 806183014U, // <2,2,u,3>: Cost 1 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 1879925084U, // <2,2,u,6>: Cost 2 vzipr LHS, <0,4,2,6> + 1746177577U, // <2,2,u,7>: Cost 2 vuzpr <0,2,0,2>, RHS + 806183019U, // <2,2,u,u>: Cost 1 vzipr LHS, LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2094374915U, // <2,3,0,3>: Cost 2 ins <2,3,0,u>, lane 3 + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2094940162U, // <2,3,0,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2094374915U, // <2,3,0,6>: Cost 2 ins <2,3,0,u>, lane 3 + 2094374915U, // <2,3,0,7>: Cost 2 ins <2,3,0,u>, lane 3 + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 
vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2094956546U, // <2,3,1,7>: Cost 2 ins <2,3,u,7>, lane 2 + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2094522371U, // <2,3,2,0>: Cost 2 ins <2,3,2,u>, lane 3 + 2094907394U, // <2,3,2,1>: Cost 2 ins <2,3,u,1>, lane 2 + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1059889156U, // <2,3,2,3>: Cost 1 ins LHS, lane 4 + 2094522371U, // <2,3,2,4>: Cost 2 ins <2,3,2,u>, lane 3 + 2094940162U, // <2,3,2,5>: Cost 2 ins <2,3,u,5>, lane 2 + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2094956546U, // <2,3,2,7>: Cost 2 ins <2,3,u,7>, lane 2 + 1059889156U, // <2,3,2,u>: Cost 1 ins LHS, lane 4 + 1879884694U, // <2,3,3,0>: Cost 2 vzipr LHS, <1,2,3,0> + 2094907394U, // <2,3,3,1>: Cost 2 ins <2,3,u,1>, lane 2 + 1879884534U, // <2,3,3,2>: Cost 2 vzipr LHS, <1,0,3,2> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1879884698U, // <2,3,3,4>: Cost 2 vzipr LHS, <1,2,3,4> + 2094940162U, // <2,3,3,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2953627415U, // <2,3,3,6>: Cost 3 vzipr LHS, <2,4,3,6> + 1884529808U, // <2,3,3,7>: Cost 2 vzipr LHS, <1,5,3,7> + 1879884702U, // <2,3,3,u>: Cost 2 vzipr LHS, <1,2,3,u> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2094669827U, // <2,3,4,2>: Cost 2 ins <2,3,4,u>, lane 3 + 2094669827U, // <2,3,4,3>: Cost 2 ins <2,3,4,u>, lane 3 + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1691241782U, // <2,3,4,6>: Cost 2 vuzpl <2,2,3,3>, RHS + 2094669827U, // <2,3,4,7>: Cost 2 ins <2,3,4,u>, lane 3 + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726274U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, <0,2,3,5> + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860843U, // <2,3,5,2>: Cost 3 vext2 
LHS, <5,2,1,3> + 2094923778U, // <2,3,5,3>: Cost 2 ins <2,3,u,3>, lane 2 + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1758350646U, // <2,3,5,7>: Cost 2 vuzpr <2,2,3,3>, RHS + 1758350647U, // <2,3,5,u>: Cost 2 vuzpr <2,2,3,3>, RHS + 2094817283U, // <2,3,6,0>: Cost 2 ins <2,3,6,u>, lane 3 + 2094907394U, // <2,3,6,1>: Cost 2 ins <2,3,u,1>, lane 2 + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2094923778U, // <2,3,6,3>: Cost 2 ins <2,3,u,3>, lane 2 + 2094817283U, // <2,3,6,4>: Cost 2 ins <2,3,6,u>, lane 3 + 2094940162U, // <2,3,6,5>: Cost 2 ins <2,3,u,5>, lane 2 + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1060216836U, // <2,3,6,7>: Cost 1 ins RHS, lane 4 + 1060216836U, // <2,3,6,u>: Cost 1 ins RHS, lane 4 + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2094907394U, // <2,3,7,1>: Cost 2 ins <2,3,u,1>, lane 2 + 2974892790U, // <2,3,7,2>: Cost 3 vzipr <3,6,2,7>, <1,0,3,2> + 2133999620U, // <2,3,7,3>: Cost 2 ins <u,u,7,3>, lane 4 + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2094940162U, // <2,3,7,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2134024196U, // <2,3,7,6>: Cost 2 ins <u,u,7,6>, lane 4 + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1879925654U, // <2,3,u,0>: Cost 2 vzipr LHS, <1,2,3,0> + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1879925494U, // <2,3,u,2>: Cost 2 vzipr LHS, <1,0,3,2> + 1059889156U, // <2,3,u,3>: Cost 1 ins LHS, lane 4 + 1879925658U, // <2,3,u,4>: Cost 2 vzipr LHS, <1,2,3,4> + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1060216836U, // <2,3,u,7>: Cost 1 ins RHS, lane 4 + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2826125312U, // <2,4,0,0>: Cost 3 vuzpr <1,2,3,4>, <0,0,0,0> + 2097635329U, // <2,4,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 
1691992166U, // <2,4,0,2>: Cost 2 vuzpl <2,3,4,5>, LHS + 3171393537U, // <2,4,0,3>: Cost 3 ins <2,u,0,3>, lane 1 + 2765734092U, // <2,4,0,4>: Cost 3 vuzpl <2,3,4,5>, <0,2,4,6> + 3094528338U, // <2,4,0,5>: Cost 3 vtrnr <1,2,3,0>, <0,4,1,5> + 1960103222U, // <2,4,0,6>: Cost 2 vtrnl <2,3,0,1>, RHS + 3171426305U, // <2,4,0,7>: Cost 3 ins <2,u,0,7>, lane 1 + 1960103240U, // <2,4,0,u>: Cost 2 vtrnl <2,3,0,1>, RHS + 3204620288U, // <2,4,1,0>: Cost 3 ins <u,4,1,0>, lane 0 + 2826126132U, // <2,4,1,1>: Cost 3 vuzpr <1,2,3,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 1752383590U, // <2,4,1,3>: Cost 2 vuzpr <1,2,3,4>, LHS + 3204653056U, // <2,4,1,4>: Cost 3 ins <u,4,1,4>, lane 0 + 2130919424U, // <2,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0 + 3031936310U, // <2,4,1,6>: Cost 3 vtrnl <2,0,1,2>, RHS + 3169361922U, // <2,4,1,7>: Cost 3 ins <2,4,u,7>, lane 2 + 1752383595U, // <2,4,1,u>: Cost 2 vuzpr <1,2,3,4>, LHS + 2826126230U, // <2,4,2,0>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,0> + 3171524609U, // <2,4,2,1>: Cost 3 ins <2,u,2,1>, lane 1 + 2097790977U, // <2,4,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2130976768U, // <2,4,2,3>: Cost 2 ins <u,4,2,3>, lane 0 + 1752384410U, // <2,4,2,4>: Cost 2 vuzpr <1,2,3,4>, <1,2,3,4> + 1825377590U, // <2,4,2,5>: Cost 2 vzipl <2,2,2,2>, RHS + 1959595318U, // <2,4,2,6>: Cost 2 vtrnl <2,2,2,2>, RHS + 3171573761U, // <2,4,2,7>: Cost 3 ins <2,u,2,7>, lane 1 + 1825377833U, // <2,4,2,u>: Cost 2 vzipl <2,2,2,2>, RHS + 2826127049U, // <2,4,3,0>: Cost 3 vuzpr <1,2,3,4>, <2,3,4,0> + 2958270501U, // <2,4,3,1>: Cost 3 vzipr LHS, <0,0,4,1> + 2958270502U, // <2,4,3,2>: Cost 3 vzipr LHS, <0,0,4,2> + 2097872897U, // <2,4,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 1927662800U, // <2,4,3,4>: Cost 2 vzipr LHS, <4,4,4,4> + 1879885518U, // <2,4,3,5>: Cost 2 vzipr LHS, <2,3,4,5> + 1879883980U, // <2,4,3,6>: Cost 2 vzipr LHS, <0,2,4,6> + 2097905665U, // <2,4,3,7>: Cost 2 ins <2,u,3,7>, lane 1 + 1879883982U, // <2,4,3,u>: Cost 2 vzipr LHS, <0,2,4,u> + 
2563735654U, // <2,4,4,0>: Cost 3 vext1 <2,2,4,4>, LHS + 2826127824U, // <2,4,4,1>: Cost 3 vuzpr <1,2,3,4>, <3,4,0,1> + 2826127834U, // <2,4,4,2>: Cost 3 vuzpr <1,2,3,4>, <3,4,1,2> + 2826127106U, // <2,4,4,3>: Cost 3 vuzpr <1,2,3,4>, <2,4,1,3> + 2131132416U, // <2,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0 + 2097963009U, // <2,4,4,5>: Cost 2 ins <2,u,4,5>, lane 1 + 1691995446U, // <2,4,4,6>: Cost 2 vuzpl <2,3,4,5>, RHS + 3094562602U, // <2,4,4,7>: Cost 3 vtrnr <1,2,3,4>, <2,4,5,7> + 1691995464U, // <2,4,4,u>: Cost 2 vuzpl <2,3,4,5>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2765737726U, // <2,4,5,3>: Cost 3 vuzpl <2,3,4,5>, <5,2,3,4> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 2131214336U, // <2,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0 + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1752386870U, // <2,4,5,7>: Cost 2 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066380U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, <0,2,4,6> + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 1828146486U, // <2,4,6,5>: Cost 2 vzipl <2,6,3,7>, RHS + 2131296256U, // <2,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0 + 2131304448U, // <2,4,6,7>: Cost 2 ins <u,4,6,7>, lane 0 + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 2867934030U, // <2,4,7,1>: Cost 3 vuzpr <u,2,3,4>, <6,7,0,1> + 3169320962U, // <2,4,7,2>: Cost 3 ins <2,4,u,2>, lane 2 + 2867933312U, // <2,4,7,3>: Cost 3 vuzpr <u,2,3,4>, <5,7,1,3> + 3205095424U, // <2,4,7,4>: Cost 3 ins <u,4,7,4>, lane 0 + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2131369984U, // 
<2,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 2867933352U, // <2,4,7,7>: Cost 3 vuzpr <u,2,3,4>, <5,7,5,7> + 2131369984U, // <2,4,7,u>: Cost 2 ins <u,4,7,6>, lane 0 + 1478082766U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, <0,2,4,u> + 2097635329U, // <2,4,u,1>: Cost 2 ins <2,u,0,1>, lane 1 + 1691997998U, // <2,4,u,2>: Cost 2 vuzpl <2,3,4,5>, LHS + 1752384157U, // <2,4,u,3>: Cost 2 vuzpr <1,2,3,4>, LHS + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 1879926478U, // <2,4,u,5>: Cost 2 vzipr LHS, <2,3,4,5> + 1879924940U, // <2,4,u,6>: Cost 2 vzipr LHS, <0,2,4,6> + 1752387113U, // <2,4,u,7>: Cost 2 vuzpr <1,2,3,4>, RHS + 1879924942U, // <2,4,u,u>: Cost 2 vzipr LHS, <0,2,4,u> + 2765160612U, // <2,5,0,0>: Cost 3 vuzpl <2,2,5,7>, <0,2,0,2> + 2097635329U, // <2,5,0,1>: Cost 2 ins <2,u,0,1>, lane 1 + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 3136335876U, // <2,5,0,5>: Cost 3 vtrnr <u,2,3,0>, <5,5,5,5> + 3171418113U, // <2,5,0,6>: Cost 3 ins <2,u,0,6>, lane 1 + 2020789558U, // <2,5,0,7>: Cost 2 vtrnr <1,2,3,0>, RHS + 2020789559U, // <2,5,0,u>: Cost 2 vtrnr <1,2,3,0>, RHS + 2599616614U, // <2,5,1,0>: Cost 3 vext1 <u,2,5,1>, LHS + 3205292032U, // <2,5,1,1>: Cost 3 ins <u,5,1,1>, lane 0 + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 2599620736U, // <2,5,1,5>: Cost 3 vext1 <u,2,5,1>, <5,7,1,3> + 3205332992U, // <2,5,1,6>: Cost 3 ins <u,5,1,6>, lane 0 + 2131599360U, // <2,5,1,7>: Cost 2 ins <u,5,1,7>, lane 0 + 2131599360U, // <2,5,1,u>: Cost 2 ins <u,5,1,7>, lane 0 + 3171516417U, // <2,5,2,0>: Cost 3 ins <2,u,2,0>, lane 1 + 3006040978U, // <2,5,2,1>: Cost 3 vzipr <u,u,2,2>, <4,0,5,1> + 2097790977U, // <2,5,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2131640320U, // <2,5,2,3>: Cost 2 ins <u,5,2,3>, 
lane 0 + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 2820014256U, // <2,5,2,5>: Cost 3 vuzpr <0,2,1,5>, <0,2,1,5> + 2958264834U, // <2,5,2,6>: Cost 3 vzipr <0,u,2,2>, <3,4,5,6> + 2014612790U, // <2,5,2,7>: Cost 2 vtrnr <0,2,0,2>, RHS + 2014612791U, // <2,5,2,u>: Cost 2 vtrnr <0,2,0,2>, RHS + 2958273506U, // <2,5,3,0>: Cost 3 vzipr LHS, <4,1,5,0> + 1927662482U, // <2,5,3,1>: Cost 2 vzipr LHS, <4,0,5,1> + 2899955454U, // <2,5,3,2>: Cost 3 vzipl <2,3,4,5>, <5,2,3,4> + 2097872897U, // <2,5,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 1927662810U, // <2,5,3,5>: Cost 2 vzipr LHS, <4,4,5,5> + 1879886338U, // <2,5,3,6>: Cost 2 vzipr LHS, <3,4,5,6> + 1879884800U, // <2,5,3,7>: Cost 2 vzipr LHS, <1,3,5,7> + 1879884801U, // <2,5,3,u>: Cost 2 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3171672065U, // <2,5,4,1>: Cost 3 ins <2,u,4,1>, lane 1 + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 3034173182U, // <2,5,4,3>: Cost 3 vtrnl <2,3,4,5>, <5,2,3,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2097963009U, // <2,5,4,5>: Cost 2 ins <2,u,4,5>, lane 1 + 2820164098U, // <2,5,4,6>: Cost 3 vuzpr <0,2,3,5>, <3,4,5,6> + 2020822326U, // <2,5,4,7>: Cost 2 vtrnr <1,2,3,4>, RHS + 2020822327U, // <2,5,4,u>: Cost 2 vtrnr <1,2,3,4>, RHS + 2599649382U, // <2,5,5,0>: Cost 3 vext1 <u,2,5,5>, LHS + 3003411346U, // <2,5,5,1>: Cost 3 vzipr <u,4,2,5>, <4,0,5,1> + 2563819142U, // <2,5,5,2>: Cost 3 vext1 <2,2,5,5>, <2,2,5,5> + 2953642113U, // <2,5,5,3>: Cost 3 vzipr <0,1,2,5>, <0,1,5,3> + 2599652662U, // <2,5,5,4>: Cost 3 vext1 <u,2,5,5>, RHS + 2131877888U, // <2,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0 + 2954971650U, // <2,5,5,6>: Cost 3 vzipr <0,3,2,5>, <3,4,5,6> + 2131894272U, // <2,5,5,7>: Cost 2 ins <u,5,5,7>, lane 0 + 2131877888U, // <2,5,5,u>: Cost 2 ins <u,5,5,5>, lane 0 + 2131910656U, // <2,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0 + 2131918848U, // <2,5,6,1>: 
Cost 2 ins <u,5,6,1>, lane 0 + 2131927040U, // <2,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2131935232U, // <2,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0 + 2131943424U, // <2,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0 + 2131951616U, // <2,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 2131959808U, // <2,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0 + 1058226176U, // <2,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <2,5,6,u>: Cost 1 ins RHS, lane 0 + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2712244352U, // <2,5,7,1>: Cost 3 vext3 <4,6,0,2>, <5,7,1,3> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 2953658497U, // <2,5,7,3>: Cost 3 vzipr <0,1,2,7>, <0,1,5,3> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712244392U, // <2,5,7,5>: Cost 3 vext3 <4,6,0,2>, <5,7,5,7> + 2712244396U, // <2,5,7,6>: Cost 3 vext3 <4,6,0,2>, <5,7,6,2> + 2132041728U, // <2,5,7,7>: Cost 2 ins <u,5,7,7>, lane 0 + 2132041728U, // <2,5,7,u>: Cost 2 ins <u,5,7,7>, lane 0 + 2131910656U, // <2,5,u,0>: Cost 2 ins <u,5,6,0>, lane 0 + 1927703442U, // <2,5,u,1>: Cost 2 vzipr LHS, <4,0,5,1> + 2097790977U, // <2,5,u,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2097872897U, // <2,5,u,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2131943424U, // <2,5,u,4>: Cost 2 ins <u,5,6,4>, lane 0 + 1927703770U, // <2,5,u,5>: Cost 2 vzipr LHS, <4,4,5,5> + 1879927298U, // <2,5,u,6>: Cost 2 vzipr LHS, <3,4,5,6> + 1058226176U, // <2,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <2,5,u,u>: Cost 1 ins RHS, lane 0 + 2820243456U, // <2,6,0,0>: Cost 3 vuzpr <0,2,4,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2132148224U, // <2,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0 + 3171393537U, // <2,6,0,3>: Cost 3 ins <2,u,0,3>, lane 1 + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3170672642U, // <2,6,0,5>: Cost 3 ins <2,6,u,5>, lane 2 + 3136335220U, // <2,6,0,6>: Cost 3 vtrnr <u,2,3,0>, <4,6,4,6> + 2096947202U, // <2,6,0,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1546355357U, // <2,6,0,u>: Cost 2 vext2 
<0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2820244276U, // <2,6,1,1>: Cost 3 vuzpr <0,2,4,6>, <1,1,1,1> + 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 1746501734U, // <2,6,1,3>: Cost 2 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3205996544U, // <2,6,1,6>: Cost 3 ins <u,6,1,6>, lane 0 + 2096947202U, // <2,6,1,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1746501739U, // <2,6,1,u>: Cost 2 vuzpr <0,2,4,6>, LHS + 2820244374U, // <2,6,2,0>: Cost 3 vuzpr <0,2,4,6>, <1,2,3,0> + 3171524609U, // <2,6,2,1>: Cost 3 ins <2,u,2,1>, lane 1 + 2097790977U, // <2,6,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2096955397U, // <2,6,2,3>: Cost 2 ins <2,6,u,u>, lane 5 + 2820243622U, // <2,6,2,4>: Cost 3 vuzpr <0,2,4,6>, <0,2,0,4> + 3171557377U, // <2,6,2,5>: Cost 3 ins <2,u,2,5>, lane 1 + 1746501836U, // <2,6,2,6>: Cost 2 vuzpr <0,2,4,6>, <0,2,4,6> + 1884523830U, // <2,6,2,7>: Cost 2 vzipr <0,u,2,2>, RHS + 1884523831U, // <2,6,2,u>: Cost 2 vzipr <0,u,2,2>, RHS + 2096586755U, // <2,6,3,0>: Cost 2 ins <2,6,3,u>, lane 3 + 2096586755U, // <2,6,3,1>: Cost 2 ins <2,6,3,u>, lane 3 + 1927662492U, // <2,6,3,2>: Cost 2 vzipr LHS, <4,0,6,2> + 2097872897U, // <2,6,3,3>: Cost 2 ins <2,u,3,3>, lane 1 + 2096586755U, // <2,6,3,4>: Cost 2 ins <2,6,3,u>, lane 3 + 2096586755U, // <2,6,3,5>: Cost 2 ins <2,6,3,u>, lane 3 + 1927662820U, // <2,6,3,6>: Cost 2 vzipr LHS, <4,4,6,6> + 806145334U, // <2,6,3,7>: Cost 1 vzipr LHS, RHS + 806145335U, // <2,6,3,u>: Cost 1 vzipr LHS, RHS + 2820245292U, // <2,6,4,0>: Cost 3 vuzpr <0,2,4,6>, <2,4,6,0> + 3171672065U, // <2,6,4,1>: Cost 3 ins <2,u,4,1>, lane 1 + 2820243782U, // <2,6,4,2>: Cost 3 vuzpr <0,2,4,6>, <0,4,0,2> + 3171688449U, // <2,6,4,3>: Cost 3 ins <2,u,4,3>, lane 1 + 2820243784U, // <2,6,4,4>: Cost 3 vuzpr <0,2,4,6>, <0,4,0,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2132475904U, // <2,6,4,6>: Cost 2 
ins <u,6,4,6>, lane 0 + 2096947202U, // <2,6,4,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3170476035U, // <2,6,5,0>: Cost 3 ins <2,6,5,u>, lane 3 + 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> + 3206258688U, // <2,6,5,2>: Cost 3 ins <u,6,5,2>, lane 0 + 3170656258U, // <2,6,5,3>: Cost 3 ins <2,6,u,3>, lane 2 + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2868023300U, // <2,6,5,5>: Cost 3 vuzpr <u,2,4,6>, <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> + 1746505014U, // <2,6,5,7>: Cost 2 vuzpr <0,2,4,6>, RHS + 1746505015U, // <2,6,5,u>: Cost 2 vuzpr <0,2,4,6>, RHS + 2955643964U, // <2,6,6,0>: Cost 3 vzipr <0,4,2,6>, <4,2,6,0> + 2820246859U, // <2,6,6,1>: Cost 3 vuzpr <0,2,4,6>, <4,6,0,1> + 2820246860U, // <2,6,6,2>: Cost 3 vuzpr <0,2,4,6>, <4,6,0,2> + 2820245412U, // <2,6,6,3>: Cost 3 vuzpr <0,2,4,6>, <2,6,1,3> + 2955643968U, // <2,6,6,4>: Cost 3 vzipr <0,4,2,6>, <4,2,6,4> + 2820246899U, // <2,6,6,5>: Cost 3 vuzpr <0,2,4,6>, <4,6,4,5> + 2132623360U, // <2,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0 + 1881902390U, // <2,6,6,7>: Cost 2 vzipr <0,4,2,6>, RHS + 1881902391U, // <2,6,6,u>: Cost 2 vzipr <0,4,2,6>, RHS + 2132647936U, // <2,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0 + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 3124596044U, // <2,6,7,2>: Cost 3 vtrnr <6,2,5,7>, <4,6,0,2> + 2868023424U, // <2,6,7,3>: Cost 3 vuzpr <u,2,4,6>, <5,7,1,3> + 2132680704U, // <2,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0 + 2252181996U, // <2,6,7,5>: Cost 3 vrev <6,2,5,7> + 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2132705280U, // <2,6,7,7>: Cost 2 ins <u,6,7,7>, lane 0 + 2132647936U, // <2,6,7,u>: Cost 2 ins <u,6,7,0>, lane 0 + 2096586755U, // <2,6,u,0>: Cost 2 ins <2,6,3,u>, lane 3 + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 1927703452U, // <2,6,u,2>: Cost 2 vzipr LHS, <4,0,6,2> + 1746502301U, // <2,6,u,3>: Cost 2 vuzpr <0,2,4,6>, LHS + 1594136612U, // 
<2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 1927703780U, // <2,6,u,6>: Cost 2 vzipr LHS, <4,4,6,6> + 806186294U, // <2,6,u,7>: Cost 1 vzipr LHS, RHS + 806186295U, // <2,6,u,u>: Cost 1 vzipr LHS, RHS + 2581839974U, // <2,7,0,0>: Cost 3 vext1 <5,2,7,0>, LHS + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2581843254U, // <2,7,0,4>: Cost 3 vext1 <5,2,7,0>, RHS + 2581843742U, // <2,7,0,5>: Cost 3 vext1 <5,2,7,0>, <5,2,7,0> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 3136336040U, // <2,7,0,7>: Cost 3 vtrnr <u,2,3,0>, <5,7,5,7> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3206619136U, // <2,7,1,1>: Cost 3 ins <u,7,1,1>, lane 0 + 3206627328U, // <2,7,1,2>: Cost 3 ins <u,7,1,2>, lane 0 + 2132893696U, // <2,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0 + 2599767350U, // <2,7,1,4>: Cost 3 vext1 <u,2,7,1>, RHS + 3206651904U, // <2,7,1,5>: Cost 3 ins <u,7,1,5>, lane 0 + 3171344386U, // <2,7,1,6>: Cost 3 ins <2,7,u,6>, lane 2 + 2599769082U, // <2,7,1,7>: Cost 3 vext1 <u,2,7,1>, <7,0,1,2> + 2132893696U, // <2,7,1,u>: Cost 2 ins <u,7,1,3>, lane 0 + 2581856358U, // <2,7,2,0>: Cost 3 vext1 <5,2,7,2>, LHS + 3136131918U, // <2,7,2,1>: Cost 3 vtrnr <u,2,0,2>, <6,7,0,1> + 2097790977U, // <2,7,2,2>: Cost 2 ins <2,u,2,2>, lane 1 + 2132967424U, // <2,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0 + 2581859638U, // <2,7,2,4>: Cost 3 vext1 <5,2,7,2>, RHS + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 1770548291U, // <2,7,2,7>: Cost 2 vuzpr <4,2,6,7>, <4,2,6,7> + 2097790977U, // <2,7,2,u>: Cost 2 ins <2,u,2,2>, lane 1 + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 
2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 1927663312U, // <2,7,3,3>: Cost 2 vzipr LHS, <5,1,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 1927663640U, // <2,7,3,7>: Cost 2 vzipr LHS, <5,5,7,7> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS + 2581872742U, // <2,7,4,0>: Cost 3 vext1 <5,2,7,4>, LHS + 2581873562U, // <2,7,4,1>: Cost 3 vext1 <5,2,7,4>, <1,2,3,4> + 3171680257U, // <2,7,4,2>: Cost 3 ins <2,u,4,2>, lane 1 + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 2581876022U, // <2,7,4,4>: Cost 3 vext1 <5,2,7,4>, RHS + 2133131264U, // <2,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0 + 2712245609U, // <2,7,4,6>: Cost 3 vext3 <4,6,0,2>, <7,4,6,0> + 3136368808U, // <2,7,4,7>: Cost 3 vtrnr <u,2,3,4>, <5,7,5,7> + 2133131264U, // <2,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0 + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3206914048U, // <2,7,5,1>: Cost 3 ins <u,7,5,1>, lane 0 + 2844290353U, // <2,7,5,2>: Cost 3 vuzpr <4,2,6,7>, <4,5,6,2> + 2991469050U, // <2,7,5,3>: Cost 3 vzipr <6,4,2,5>, <6,2,7,3> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS + 3206946816U, // <2,7,5,5>: Cost 3 ins <u,7,5,5>, lane 0 + 3206955008U, // <2,7,5,6>: Cost 3 ins <u,7,5,6>, lane 0 + 2133221376U, // <2,7,5,7>: Cost 2 ins <u,7,5,7>, lane 0 + 2133221376U, // <2,7,5,u>: Cost 2 ins <u,7,5,7>, lane 0 + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3136459598U, // <2,7,6,1>: Cost 3 vtrnr <u,2,4,6>, <6,7,0,1> + 2901890250U, // <2,7,6,2>: Cost 3 vzipl <2,6,3,7>, <7,2,6,3> + 3136458880U, // <2,7,6,3>: Cost 3 vtrnr <u,2,4,6>, <5,7,1,3> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 2133295104U, // <2,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0 + 2133295104U, // <2,7,6,u>: Cost 
2 ins <u,7,6,7>, lane 0 + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3207061504U, // <2,7,7,1>: Cost 3 ins <u,7,7,1>, lane 0 + 2563983002U, // <2,7,7,2>: Cost 3 vext1 <2,2,7,7>, <2,2,7,7> + 2998784506U, // <2,7,7,3>: Cost 3 vzipr <7,6,2,7>, <6,2,7,3> + 2599816502U, // <2,7,7,4>: Cost 3 vext1 <u,2,7,7>, RHS + 3207094272U, // <2,7,7,5>: Cost 3 ins <u,7,7,5>, lane 0 + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2133368832U, // <2,7,7,7>: Cost 2 ins <u,7,7,7>, lane 0 + 2133368832U, // <2,7,7,u>: Cost 2 ins <u,7,7,7>, lane 0 + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2097790977U, // <2,7,u,2>: Cost 2 ins <2,u,2,2>, lane 1 + 1927704272U, // <2,7,u,3>: Cost 2 vzipr LHS, <5,1,7,3> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2133131264U, // <2,7,u,5>: Cost 2 ins <u,7,4,5>, lane 0 + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 1927704600U, // <2,7,u,7>: Cost 2 vzipr LHS, <5,5,7,7> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2020786845U, // <2,u,0,3>: Cost 2 vtrnr <1,2,3,0>, LHS + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2094940162U, // <2,u,0,5>: Cost 2 ins <2,3,u,5>, lane 2 + 1960106138U, // <2,u,0,6>: Cost 2 vtrnl <2,3,0,1>, RHS + 2020789801U, // <2,u,0,7>: Cost 2 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 
2096947202U, // <2,u,1,7>: Cost 2 ins <2,6,u,7>, lane 2 + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328556U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, <0,2,u,2> + 1825380142U, // <2,u,2,1>: Cost 2 vzipl <2,2,2,2>, LHS + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1055244288U, // <2,u,2,3>: Cost 1 ins LHS, lane 0 + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 1825380506U, // <2,u,2,5>: Cost 2 vzipl <2,2,2,2>, RHS + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2014613033U, // <2,u,2,7>: Cost 2 vtrnr <0,2,0,2>, RHS + 1055244288U, // <2,u,2,u>: Cost 1 ins LHS, lane 0 + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1879885550U, // <2,u,3,1>: Cost 2 vzipr LHS, <2,3,u,1> + 1879884012U, // <2,u,3,2>: Cost 2 vzipr LHS, <0,2,u,2> + 806142108U, // <2,u,3,3>: Cost 1 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 1879885554U, // <2,u,3,5>: Cost 2 vzipr LHS, <2,3,u,5> + 1879884016U, // <2,u,3,6>: Cost 2 vzipr LHS, <0,2,u,6> + 806145352U, // <2,u,3,7>: Cost 1 vzipr LHS, RHS + 806142113U, // <2,u,3,u>: Cost 1 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 1960433454U, // <2,u,4,2>: Cost 2 vtrnl <2,3,4,5>, LHS + 2020819613U, // <2,u,4,3>: Cost 2 vtrnr <1,2,3,4>, LHS + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1691610422U, // <2,u,4,6>: Cost 2 vuzpl <2,2,u,3>, RHS + 2020822569U, // <2,u,4,7>: Cost 2 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2094252034U, // <2,u,5,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2094260226U, // <2,u,5,3>: Cost 2 ins <2,2,u,3>, lane 2 + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 
<0,2,0,2>, RHS + 1746226486U, // <2,u,5,7>: Cost 2 vuzpr <0,2,0,u>, RHS + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361328U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, <0,2,u,6> + 1828149038U, // <2,u,6,1>: Cost 2 vzipl <2,6,3,7>, LHS + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2014937757U, // <2,u,6,3>: Cost 2 vtrnr <0,2,4,6>, LHS + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 1828149402U, // <2,u,6,5>: Cost 2 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1060216836U, // <2,u,6,7>: Cost 1 ins RHS, lane 4 + 1060216836U, // <2,u,6,u>: Cost 1 ins RHS, lane 4 + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2094907394U, // <2,u,7,1>: Cost 2 ins <2,3,u,1>, lane 2 + 2094252034U, // <2,u,7,2>: Cost 2 ins <2,2,u,2>, lane 2 + 2129354752U, // <2,u,7,3>: Cost 2 ins <u,1,7,3>, lane 0 + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2094940162U, // <2,u,7,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2134024196U, // <2,u,7,6>: Cost 2 ins <u,u,7,6>, lane 4 + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1879925699U, // <2,u,u,0>: Cost 2 vzipr LHS, <1,2,u,0> + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 806183068U, // <2,u,u,3>: Cost 1 vzipr LHS, LHS + 1879925703U, // <2,u,u,4>: Cost 2 vzipr LHS, <1,2,u,4> + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1879924976U, // <2,u,u,6>: Cost 2 vzipr LHS, <0,2,u,6> + 806186312U, // <2,u,u,7>: Cost 1 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2960312624U, // <3,0,0,3>: Cost 3 vzipr <1,2,3,0>, <3,2,0,3> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3177381889U, // <3,0,0,5>: Cost 3 ins <3,u,0,5>, lane 1 + 3177390081U, // <3,0,0,6>: 
Cost 3 ins <3,u,0,6>, lane 1 + 3177398273U, // <3,0,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2128232448U, // <3,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0 + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2098429955U, // <3,0,1,3>: Cost 2 ins <3,0,1,u>, lane 3 + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2098429955U, // <3,0,1,5>: Cost 2 ins <3,0,1,u>, lane 3 + 2098429955U, // <3,0,1,6>: Cost 2 ins <3,0,1,u>, lane 3 + 2098429955U, // <3,0,1,7>: Cost 2 ins <3,0,1,u>, lane 3 + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2128314368U, // <3,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0 + 2098946053U, // <3,0,2,3>: Cost 2 ins <3,0,u,u>, lane 5 + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2959000610U, // <3,0,2,5>: Cost 3 vzipr <1,0,3,2>, <1,4,0,5> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 3177545729U, // <3,0,2,7>: Cost 3 ins <3,u,2,7>, lane 1 + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2820636924U, // <3,0,3,0>: Cost 3 vuzpr <0,3,1,0>, <0,3,1,0> + 1832091750U, // <3,0,3,1>: Cost 2 vzipl <3,3,3,3>, LHS + 1966309478U, // <3,0,3,2>: Cost 2 vtrnl <3,3,3,3>, LHS + 2103844865U, // <3,0,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 2772716034U, // <3,0,3,5>: Cost 3 vuzpl <3,5,0,2>, <3,4,5,6> + 3177611265U, // <3,0,3,6>: Cost 3 ins <3,u,3,6>, lane 1 + 3177619457U, // <3,0,3,7>: Cost 3 ins <3,u,3,7>, lane 1 + 1832092317U, // <3,0,3,u>: Cost 2 vzipl <3,3,3,3>, LHS + 2689835334U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,2> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 2906669312U, // <3,0,4,3>: Cost 3 vzipl <3,4,5,6>, <0,3,1,4> + 2689835373U, // <3,0,4,4>: Cost 3 vext3 LHS, 
<0,4,4,5> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2769382710U, // <3,0,4,6>: Cost 3 vuzpl <3,0,0,0>, RHS + 3177693185U, // <3,0,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 3101278208U, // <3,0,5,0>: Cost 3 vtrnr <2,3,4,5>, <0,0,0,0> + 2128527360U, // <3,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0 + 1967145062U, // <3,0,5,2>: Cost 2 vtrnl <3,4,5,6>, LHS + 3040886978U, // <3,0,5,3>: Cost 3 vtrnl <3,4,5,6>, <0,2,3,5> + 3040886988U, // <3,0,5,4>: Cost 3 vtrnl <3,4,5,6>, <0,2,4,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> + 2104016897U, // <3,0,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 2820640054U, // <3,0,5,7>: Cost 3 vuzpr <0,3,1,0>, RHS + 1967145116U, // <3,0,5,u>: Cost 2 vtrnl <3,4,5,6>, LHS + 3202334720U, // <3,0,6,0>: Cost 3 ins <u,0,6,0>, lane 0 + 2907635814U, // <3,0,6,1>: Cost 3 vzipl <3,6,0,7>, LHS + 2128609280U, // <3,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 3177807873U, // <3,0,6,3>: Cost 3 ins <3,u,6,3>, lane 1 + 3202367488U, // <3,0,6,4>: Cost 3 ins <u,0,6,4>, lane 0 + 3172663298U, // <3,0,6,5>: Cost 3 ins <3,0,u,5>, lane 2 + 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> + 2098946053U, // <3,0,6,7>: Cost 2 ins <3,0,u,u>, lane 5 + 2128609280U, // <3,0,6,u>: Cost 2 ins <u,0,6,2>, lane 0 + 3095396352U, // <3,0,7,0>: Cost 3 vtrnr <1,3,5,7>, <0,0,0,0> + 3095396362U, // <3,0,7,1>: Cost 3 vtrnr <1,3,5,7>, <0,0,1,1> + 2098896898U, // <3,0,7,2>: Cost 2 ins <3,0,u,2>, lane 2 + 3177881601U, // <3,0,7,3>: Cost 3 ins <3,u,7,3>, lane 1 + 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> + 3177897985U, // <3,0,7,5>: Cost 3 ins <3,u,7,5>, lane 1 + 3202457600U, // <3,0,7,6>: Cost 3 ins <u,0,7,6>, lane 0 + 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> + 2098896898U, // <3,0,7,u>: Cost 2 ins <3,0,u,2>, lane 2 + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 
vext3 LHS, LHS + 2098429955U, // <3,0,u,3>: Cost 2 ins <3,0,1,u>, lane 3 + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2098429955U, // <3,0,u,6>: Cost 2 ins <3,0,1,u>, lane 3 + 2098429955U, // <3,0,u,7>: Cost 2 ins <3,0,1,u>, lane 3 + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201468U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, <0,3,1,0> + 2128822272U, // <3,1,0,1>: Cost 2 ins <u,1,0,1>, lane 0 + 1695727718U, // <3,1,0,2>: Cost 2 vuzpl <3,0,1,2>, LHS + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2960310610U, // <3,1,0,5>: Cost 3 vzipr <1,2,3,0>, <0,4,1,5> + 2832516572U, // <3,1,0,6>: Cost 3 vuzpr <2,3,0,1>, <2,0,4,6> + 3177398273U, // <3,1,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2103689217U, // <3,1,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3177463809U, // <3,1,1,6>: Cost 3 ins <3,u,1,6>, lane 1 + 3100952848U, // <3,1,1,7>: Cost 3 vtrnr <2,3,0,1>, <3,1,5,7> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2128961536U, // <3,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0 + 2128969728U, // <3,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 2128977920U, // <3,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0 + 1055244288U, // <3,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <3,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0 + 2129002496U, // <3,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2129010688U, // <3,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2129018880U, // <3,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0 + 1055244288U, // <3,1,2,u>: Cost 1 ins LHS, lane 0 + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // 
<3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2021326950U, // <3,1,3,3>: Cost 2 vtrnr <1,3,1,3>, LHS + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2832516096U, // <3,1,3,7>: Cost 3 vuzpr <2,3,0,1>, <1,3,5,7> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234240U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, <0,3,1,4> + 2960343050U, // <3,1,4,1>: Cost 3 vzipr <1,2,3,4>, <0,0,1,1> + 2960345238U, // <3,1,4,2>: Cost 3 vzipr <1,2,3,4>, <3,0,1,2> + 2129133568U, // <3,1,4,3>: Cost 2 ins <u,1,4,3>, lane 0 + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2129149952U, // <3,1,4,5>: Cost 2 ins <u,1,4,5>, lane 0 + 1695730998U, // <3,1,4,6>: Cost 2 vuzpl <3,0,1,2>, RHS + 3177693185U, // <3,1,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1695731016U, // <3,1,4,u>: Cost 2 vuzpl <3,0,1,2>, RHS + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 2961678674U, // <3,1,5,5>: Cost 3 vzipr <1,4,3,5>, <0,4,1,5> + 2104016897U, // <3,1,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1758776630U, // <3,1,5,7>: Cost 2 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2907783926U, // <3,1,6,0>: Cost 3 vzipl <3,6,2,7>, <1,0,3,2> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2222752740U, // <3,1,6,2>: Cost 3 vrev <1,3,2,6> + 2129281024U, // <3,1,6,3>: Cost 2 ins <u,1,6,3>, lane 0 + 2222900214U, // <3,1,6,4>: Cost 3 vrev <1,3,4,6> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2868350324U, // <3,1,6,6>: Cost 3 vuzpr <u,3,0,1>, <4,6,4,6> + 2129313792U, // <3,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0 + 2129281024U, // <3,1,6,u>: 
Cost 2 ins <u,1,6,3>, lane 0 + 3177857025U, // <3,1,7,0>: Cost 3 ins <3,u,7,0>, lane 1 + 3095397172U, // <3,1,7,1>: Cost 3 vtrnr <1,3,5,7>, <1,1,1,1> + 2962360470U, // <3,1,7,2>: Cost 3 vzipr <1,5,3,7>, <3,0,1,2> + 2021654630U, // <3,1,7,3>: Cost 2 vtrnr <1,3,5,7>, LHS + 3177889793U, // <3,1,7,4>: Cost 3 ins <3,u,7,4>, lane 1 + 1149240320U, // <3,1,7,5>: Cost 2 vrev <1,3,5,7> + 2223055881U, // <3,1,7,6>: Cost 3 vrev <1,3,6,7> + 2868351144U, // <3,1,7,7>: Cost 3 vuzpr <u,3,0,1>, <5,7,5,7> + 2021654635U, // <3,1,7,u>: Cost 2 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 1695733550U, // <3,1,u,2>: Cost 2 vuzpl <3,0,1,2>, LHS + 1055244288U, // <3,1,u,3>: Cost 1 ins LHS, lane 0 + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 1695733914U, // <3,1,u,6>: Cost 2 vuzpl <3,0,1,2>, RHS + 1758776873U, // <3,1,u,7>: Cost 2 vuzpr <2,3,0,1>, RHS + 1055244288U, // <3,1,u,u>: Cost 1 ins LHS, lane 0 + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2129494016U, // <3,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0 + 1886568550U, // <3,2,0,3>: Cost 2 vzipr <1,2,3,0>, LHS + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2960311348U, // <3,2,0,5>: Cost 3 vzipr <1,2,3,0>, <1,4,2,5> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 3177398273U, // <3,2,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2103689217U, // <3,2,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, 
<2,1,6,3> + 3177472001U, // <3,2,1,7>: Cost 3 ins <3,u,1,7>, lane 1 + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836685U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,3> + 3177545729U, // <3,2,2,7>: Cost 3 ins <3,u,2,7>, lane 1 + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 1611450042U, // <3,2,3,2>: Cost 2 vext3 LHS, <2,3,2,3> + 1885929574U, // <3,2,3,3>: Cost 2 vzipr <1,1,3,3>, LHS + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 1611450082U, // <3,2,3,6>: Cost 2 vext3 LHS, <2,3,6,7> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280674U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,3,2,4> + 2960343060U, // <3,2,4,2>: Cost 3 vzipr <1,2,3,4>, <0,0,2,2> + 1886601318U, // <3,2,4,3>: Cost 2 vzipr <1,2,3,4>, LHS + 2960344034U, // <3,2,4,4>: Cost 3 vzipr <1,2,3,4>, <1,3,2,4> + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2129821696U, // <3,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0 + 3177693185U, // <3,2,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316170U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, <0,3,2,5> + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 
<u,0,3,2>, <5,5,5,5> + 2104016897U, // <3,2,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 2826554678U, // <3,2,5,7>: Cost 3 vuzpr <1,3,0,2>, RHS + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2129977344U, // <3,2,6,7>: Cost 2 ins <u,2,6,7>, lane 0 + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 3095397270U, // <3,2,7,0>: Cost 3 vtrnr <1,3,5,7>, <1,2,3,0> + 3203743744U, // <3,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0 + 3095396516U, // <3,2,7,2>: Cost 3 vtrnr <1,3,5,7>, <0,2,0,2> + 1888616550U, // <3,2,7,3>: Cost 2 vzipr <1,5,3,7>, LHS + 3095397274U, // <3,2,7,4>: Cost 3 vtrnr <1,3,5,7>, <1,2,3,4> + 3095396528U, // <3,2,7,5>: Cost 3 vtrnr <1,3,5,7>, <0,2,1,5> + 1155286754U, // <3,2,7,6>: Cost 2 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> + 1888616555U, // <3,2,7,u>: Cost 2 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2129494016U, // <3,2,u,2>: Cost 2 ins <u,2,0,2>, lane 0 + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2129821696U, // <3,2,u,6>: Cost 2 ins <u,2,4,6>, lane 0 + 2129977344U, // <3,2,u,7>: Cost 2 ins <u,2,6,7>, lane 0 + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 1886569366U, // <3,3,0,0>: Cost 2 vzipr <1,2,3,0>, <1,2,3,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 1697874022U, // <3,3,0,2>: Cost 2 vuzpl <3,3,3,3>, LHS + 2100895746U, // <3,3,0,3>: Cost 2 ins <3,3,u,3>, lane 2 + 
2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 3041151490U, // <3,3,0,5>: Cost 3 vtrnl <3,5,0,2>, <3,4,5,6> + 3177390081U, // <3,3,0,6>: Cost 3 ins <3,u,0,6>, lane 1 + 2960311440U, // <3,3,0,7>: Cost 3 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2103689217U, // <3,3,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1752891494U, // <3,3,1,3>: Cost 2 vuzpr <1,3,1,3>, LHS + 2826635515U, // <3,3,1,4>: Cost 3 vuzpr <1,3,1,3>, <3,1,3,4> + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3177463809U, // <3,3,1,6>: Cost 3 ins <3,u,1,6>, lane 1 + 3100951552U, // <3,3,1,7>: Cost 3 vtrnr <2,3,0,1>, <1,3,5,7> + 1752891499U, // <3,3,1,u>: Cost 2 vuzpr <1,3,1,3>, LHS + 2959000470U, // <3,3,2,0>: Cost 3 vzipr <1,0,3,2>, <1,2,3,0> + 2959000471U, // <3,3,2,1>: Cost 3 vzipr <1,0,3,2>, <1,2,3,1> + 1885258486U, // <3,3,2,2>: Cost 2 vzipr <1,0,3,2>, <1,0,3,2> + 2130313216U, // <3,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0 + 2959000474U, // <3,3,2,4>: Cost 3 vzipr <1,0,3,2>, <1,2,3,4> + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2959000720U, // <3,3,2,7>: Cost 3 vzipr <1,0,3,2>, <1,5,3,7> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2100568067U, // <3,3,3,1>: Cost 2 ins <3,3,3,u>, lane 3 + 2100568067U, // <3,3,3,2>: Cost 2 ins <3,3,3,u>, lane 3 + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2100568067U, // <3,3,3,5>: Cost 2 ins <3,3,3,u>, lane 3 + 2100568067U, // <3,3,3,6>: Cost 2 ins <3,3,3,u>, lane 3 + 2100568067U, // <3,3,3,7>: Cost 2 ins <3,3,3,u>, lane 3 + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2960343958U, // <3,3,4,0>: Cost 3 vzipr <1,2,3,4>, <1,2,3,0> + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> 
+ 2960343798U, // <3,3,4,2>: Cost 3 vzipr <1,2,3,4>, <1,0,3,2> + 2100895746U, // <3,3,4,3>: Cost 2 ins <3,3,u,3>, lane 2 + 1886602138U, // <3,3,4,4>: Cost 2 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 1697877302U, // <3,3,4,6>: Cost 2 vuzpl <3,3,3,3>, RHS + 2960344208U, // <3,3,4,7>: Cost 3 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2100895746U, // <3,3,5,3>: Cost 2 ins <3,3,u,3>, lane 2 + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 2027538126U, // <3,3,5,5>: Cost 2 vtrnr <2,3,4,5>, <2,3,4,5> + 2104016897U, // <3,3,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1752894774U, // <3,3,5,7>: Cost 2 vuzpr <1,3,1,3>, RHS + 1752894775U, // <3,3,5,u>: Cost 2 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3204333568U, // <3,3,6,1>: Cost 3 ins <u,3,6,1>, lane 0 + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2100895746U, // <3,3,6,3>: Cost 2 ins <3,3,u,3>, lane 2 + 2234845608U, // <3,3,6,4>: Cost 3 vrev <3,3,4,6> + 3204366336U, // <3,3,6,5>: Cost 3 ins <u,3,6,5>, lane 0 + 1967893085U, // <3,3,6,6>: Cost 2 vtrnl <3,5,6,7>, <3,5,6,7> + 2130640896U, // <3,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0 + 2100895746U, // <3,3,6,u>: Cost 2 ins <3,3,u,3>, lane 2 + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2962359030U, // <3,3,7,2>: Cost 3 vzipr <1,5,3,7>, <1,0,3,2> + 2100895746U, // <3,3,7,3>: Cost 2 ins <3,3,u,3>, lane 2 + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 3095398094U, // <3,3,7,5>: Cost 3 vtrnr <1,3,5,7>, <2,3,4,5> + 3174662146U, // <3,3,7,6>: Cost 3 ins <3,3,u,6>, lane 2 + 2021655552U, // <3,3,7,7>: Cost 2 vtrnr <1,3,5,7>, <1,3,5,7> + 2021655552U, // <3,3,7,u>: Cost 2 vtrnr 
<1,3,5,7>, <1,3,5,7> + 1886569366U, // <3,3,u,0>: Cost 2 vzipr <1,2,3,0>, <1,2,3,0> + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1697879854U, // <3,3,u,2>: Cost 2 vuzpl <3,3,3,3>, LHS + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 1697880218U, // <3,3,u,6>: Cost 2 vuzpl <3,3,3,3>, RHS + 1752895017U, // <3,3,u,7>: Cost 2 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3177365505U, // <3,4,0,3>: Cost 3 ins <3,u,0,3>, lane 1 + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1829948726U, // <3,4,0,5>: Cost 2 vzipl <3,0,1,2>, RHS + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3177398273U, // <3,4,0,7>: Cost 3 ins <3,u,0,7>, lane 1 + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2820669542U, // <3,4,1,3>: Cost 3 vuzpr <0,3,1,4>, LHS + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2130919424U, // <3,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0 + 1964166454U, // <3,4,1,6>: Cost 2 vtrnl <3,0,1,2>, RHS + 3177472001U, // <3,4,1,7>: Cost 3 ins <3,u,1,7>, lane 1 + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3204694016U, // <3,4,2,0>: Cost 3 ins <u,4,2,0>, lane 0 + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2101600261U, // <3,4,2,3>: Cost 2 ins <3,4,u,u>, lane 5 + 2826716058U, // <3,4,2,4>: Cost 3 vuzpr <1,3,2,4>, <1,2,3,4> + 2959001294U, // <3,4,2,5>: Cost 3 vzipr <1,0,3,2>, <2,3,4,5> + 2131001344U, // <3,4,2,6>: Cost 2 ins 
<u,4,2,6>, lane 0 + 3177545729U, // <3,4,2,7>: Cost 3 ins <3,u,2,7>, lane 1 + 2101600261U, // <3,4,2,u>: Cost 2 ins <3,4,u,u>, lane 5 + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2103844865U, // <3,4,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2820669696U, // <3,4,3,4>: Cost 3 vuzpr <0,3,1,4>, <0,3,1,4> + 1832095030U, // <3,4,3,5>: Cost 2 vzipl <3,3,3,3>, RHS + 1966312758U, // <3,4,3,6>: Cost 2 vtrnl <3,3,3,3>, RHS + 3177619457U, // <3,4,3,7>: Cost 3 ins <3,u,3,7>, lane 1 + 1832095273U, // <3,4,3,u>: Cost 2 vzipl <3,3,3,3>, RHS + 2960344777U, // <3,4,4,0>: Cost 3 vzipr <1,2,3,4>, <2,3,4,0> + 2960344778U, // <3,4,4,1>: Cost 3 vzipr <1,2,3,4>, <2,3,4,1> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2960344618U, // <3,4,4,3>: Cost 3 vzipr <1,2,3,4>, <2,1,4,3> + 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3177693185U, // <3,4,4,7>: Cost 3 ins <3,u,4,7>, lane 1 + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2101379075U, // <3,4,5,1>: Cost 2 ins <3,4,5,u>, lane 3 + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2101379075U, // <3,4,5,3>: Cost 2 ins <3,4,5,u>, lane 3 + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2131214336U, // <3,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0 + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2101379075U, // <3,4,5,7>: Cost 2 ins <3,4,5,u>, lane 3 + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 1659227468U, // <3,4,6,0>: Cost 2 vext3 LHS, <4,6,0,2> + 2689838422U, // <3,4,6,1>: Cost 3 vext3 LHS, <4,6,1,3> + 2564417231U, // <3,4,6,2>: Cost 3 vext1 <2,3,4,6>, <2,3,4,6> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 
vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2131296256U, // <3,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0 + 2101600261U, // <3,4,6,7>: Cost 2 ins <3,4,u,u>, lane 5 + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> + 2659972191U, // <3,4,7,1>: Cost 3 vext2 <7,1,3,4>, <7,1,3,4> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3177881601U, // <3,4,7,3>: Cost 3 ins <3,u,7,3>, lane 1 + 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> + 3095396690U, // <3,4,7,5>: Cost 3 vtrnr <1,3,5,7>, <0,4,1,5> + 2131369984U, // <3,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> + 2131369984U, // <3,4,7,u>: Cost 2 ins <u,4,7,6>, lane 0 + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2101600261U, // <3,4,u,3>: Cost 2 ins <3,4,u,u>, lane 5 + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2101379075U, // <3,4,u,7>: Cost 2 ins <3,4,5,u>, lane 3 + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 2832842752U, // <3,5,0,0>: Cost 3 vuzpr <2,3,4,5>, <0,0,0,0> + 2131476480U, // <3,5,0,1>: Cost 2 ins <u,5,0,1>, lane 0 + 1698709606U, // <3,5,0,2>: Cost 2 vuzpl <3,4,5,6>, LHS + 2772451522U, // <3,5,0,3>: Cost 3 vuzpl <3,4,5,6>, <0,2,3,5> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2960310647U, // <3,5,0,6>: Cost 3 vzipr <1,2,3,0>, <0,4,5,6> + 2131525632U, // <3,5,0,7>: Cost 2 ins <u,5,0,7>, lane 0 + 1698709660U, // <3,5,0,u>: Cost 2 vuzpl <3,4,5,6>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 2832843572U, // <3,5,1,1>: Cost 3 vuzpr <2,3,4,5>, <1,1,1,1> + 2103689217U, // <3,5,1,2>: Cost 2 
ins <3,u,1,2>, lane 1 + 1759101030U, // <3,5,1,3>: Cost 2 vuzpr <2,3,4,5>, LHS + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2772452352U, // <3,5,1,5>: Cost 3 vuzpl <3,4,5,6>, <1,3,5,7> + 3205332992U, // <3,5,1,6>: Cost 3 ins <u,5,1,6>, lane 0 + 2027212086U, // <3,5,1,7>: Cost 2 vtrnr <2,3,0,1>, RHS + 2027212087U, // <3,5,1,u>: Cost 2 vtrnr <2,3,0,1>, RHS + 2832843670U, // <3,5,2,0>: Cost 3 vuzpr <2,3,4,5>, <1,2,3,0> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 2832842916U, // <3,5,2,2>: Cost 3 vuzpr <2,3,4,5>, <0,2,0,2> + 2131640320U, // <3,5,2,3>: Cost 2 ins <u,5,2,3>, lane 0 + 2832842936U, // <3,5,2,4>: Cost 3 vuzpr <2,3,4,5>, <0,2,2,4> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 2959002114U, // <3,5,2,6>: Cost 3 vzipr <1,0,3,2>, <3,4,5,6> + 2131673088U, // <3,5,2,7>: Cost 2 ins <u,5,2,7>, lane 0 + 2131640320U, // <3,5,2,u>: Cost 2 ins <u,5,2,3>, lane 0 + 2772453922U, // <3,5,3,0>: Cost 3 vuzpl <3,4,5,6>, <3,5,0,2> + 2832844454U, // <3,5,3,1>: Cost 3 vuzpr <2,3,4,5>, <2,3,0,1> + 3177578497U, // <3,5,3,2>: Cost 3 ins <3,u,3,2>, lane 1 + 2103844865U, // <3,5,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 1759102670U, // <3,5,3,5>: Cost 2 vuzpr <2,3,4,5>, <2,3,4,5> + 2959673858U, // <3,5,3,6>: Cost 3 vzipr <1,1,3,3>, <3,4,5,6> + 2021330230U, // <3,5,3,7>: Cost 2 vtrnr <1,3,1,3>, RHS + 2021330231U, // <3,5,3,u>: Cost 2 vtrnr <1,3,1,3>, RHS + 2832845308U, // <3,5,4,0>: Cost 3 vuzpr <2,3,4,5>, <3,4,5,0> + 2732969871U, // <3,5,4,1>: Cost 3 vext3 LHS, <5,4,1,5> + 2832844536U, // <3,5,4,2>: Cost 3 vuzpr <2,3,4,5>, <2,4,0,2> + 3177660417U, // <3,5,4,3>: Cost 3 ins <3,u,4,3>, lane 1 + 2832845312U, // <3,5,4,4>: Cost 3 vuzpr <2,3,4,5>, <3,4,5,4> + 2131804160U, // <3,5,4,5>: Cost 2 ins <u,5,4,5>, lane 0 + 1698712886U, // <3,5,4,6>: Cost 2 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1698712904U, // <3,5,4,u>: Cost 2 vuzpl 
<3,4,5,6>, RHS + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2832846074U, // <3,5,5,1>: Cost 3 vuzpr <2,3,4,5>, <4,5,0,1> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2832845356U, // <3,5,5,3>: Cost 3 vuzpr <2,3,4,5>, <3,5,1,3> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2104016897U, // <3,5,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1759104310U, // <3,5,5,7>: Cost 2 vuzpr <2,3,4,5>, RHS + 1759104311U, // <3,5,5,u>: Cost 2 vuzpr <2,3,4,5>, RHS + 2131910656U, // <3,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0 + 2131918848U, // <3,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 2131927040U, // <3,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2131935232U, // <3,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0 + 2131943424U, // <3,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0 + 2131951616U, // <3,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 2131959808U, // <3,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0 + 1058226176U, // <3,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <3,5,6,u>: Cost 1 ins RHS, lane 0 + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 3095397528U, // <3,5,7,6>: Cost 3 vtrnr <1,3,5,7>, <1,5,4,6> + 2021657910U, // <3,5,7,7>: Cost 2 vtrnr <1,3,5,7>, RHS + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 1698715438U, // <3,5,u,2>: Cost 2 vuzpl <3,4,5,6>, LHS + 1759101597U, // <3,5,u,3>: Cost 2 vuzpr <2,3,4,5>, LHS + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 1698715802U, // <3,5,u,6>: Cost 2 vuzpl <3,4,5,6>, RHS 
+ 1058226176U, // <3,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <3,5,u,u>: Cost 1 ins RHS, lane 0 + 2732970264U, // <3,6,0,0>: Cost 3 vext3 LHS, <6,0,0,2> + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2132148224U, // <3,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0 + 3177365505U, // <3,6,0,3>: Cost 3 ins <3,u,0,3>, lane 1 + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 1886571830U, // <3,6,0,7>: Cost 2 vzipr <1,2,3,0>, RHS + 1886571831U, // <3,6,0,u>: Cost 2 vzipr <1,2,3,0>, RHS + 2720878954U, // <3,6,1,0>: Cost 3 vext3 <6,1,0,3>, <6,1,0,3> + 3205955584U, // <3,6,1,1>: Cost 3 ins <u,6,1,1>, lane 0 + 2103689217U, // <3,6,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 2826731622U, // <3,6,1,3>: Cost 3 vuzpr <1,3,2,6>, LHS + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3205988352U, // <3,6,1,5>: Cost 3 ins <u,6,1,5>, lane 0 + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2954349878U, // <3,6,1,7>: Cost 3 vzipr <0,2,3,1>, RHS + 2103689217U, // <3,6,1,u>: Cost 2 ins <3,u,1,2>, lane 1 + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 2132303872U, // <3,6,2,3>: Cost 2 ins <u,6,2,3>, lane 0 + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2826731724U, // <3,6,2,6>: Cost 3 vuzpr <1,3,2,6>, <0,2,4,6> + 1885261110U, // <3,6,2,7>: Cost 2 vzipr <1,0,3,2>, RHS + 1885261111U, // <3,6,2,u>: Cost 2 vzipr <1,0,3,2>, RHS + 3136876642U, // <3,6,3,0>: Cost 3 vtrnr <u,3,1,3>, <5,6,7,0> + 3206103040U, // <3,6,3,1>: Cost 3 ins <u,6,3,1>, lane 0 + 3001478044U, // <3,6,3,2>: Cost 3 vzipr <u,1,3,3>, <4,0,6,2> + 2103844865U, // <3,6,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, 
<3,4,5,6> + 3206135808U, // <3,6,3,5>: Cost 3 ins <u,6,3,5>, lane 0 + 1699457629U, // <3,6,3,6>: Cost 2 vuzpl <3,5,6,7>, <3,5,6,7> + 1885932854U, // <3,6,3,7>: Cost 2 vzipr <1,1,3,3>, RHS + 1885932855U, // <3,6,3,u>: Cost 2 vzipr <1,1,3,3>, RHS + 2732970588U, // <3,6,4,0>: Cost 3 vext3 LHS, <6,4,0,2> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970604U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,0> + 2906673714U, // <3,6,4,3>: Cost 3 vzipl <3,4,5,6>, <6,3,4,5> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2132475904U, // <3,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0 + 1886604598U, // <3,6,4,7>: Cost 2 vzipr <1,2,3,4>, RHS + 1886604599U, // <3,6,4,u>: Cost 2 vzipr <1,2,3,4>, RHS + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3206250496U, // <3,6,5,1>: Cost 3 ins <u,6,5,1>, lane 0 + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 3040891442U, // <3,6,5,4>: Cost 3 vtrnl <3,4,5,6>, <6,3,4,5> + 3206283264U, // <3,6,5,5>: Cost 3 ins <u,6,5,5>, lane 0 + 2104016897U, // <3,6,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 2954382646U, // <3,6,5,7>: Cost 3 vzipr <0,2,3,5>, RHS + 2104016897U, // <3,6,5,u>: Cost 2 ins <3,u,5,6>, lane 1 + 2732970748U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,0> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2732970768U, // <3,6,6,2>: Cost 3 vext3 LHS, <6,6,2,2> + 3177807873U, // <3,6,6,3>: Cost 3 ins <3,u,6,3>, lane 1 + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 1611453282U, // <3,6,7,2>: Cost 2 vext3 LHS, <6,7,2,3> + 
2968996198U, // <3,6,7,3>: Cost 3 vzipr <2,6,3,7>, <3,2,6,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2968995633U, // <3,6,7,5>: Cost 3 vzipr <2,6,3,7>, <2,4,6,5> + 1611453322U, // <3,6,7,6>: Cost 2 vext3 LHS, <6,7,6,7> + 1888619830U, // <3,6,7,7>: Cost 2 vzipr <1,5,3,7>, RHS + 1888619831U, // <3,6,7,u>: Cost 2 vzipr <1,5,3,7>, RHS + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2132148224U, // <3,6,u,2>: Cost 2 ins <u,6,0,2>, lane 0 + 2132303872U, // <3,6,u,3>: Cost 2 ins <u,6,2,3>, lane 0 + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 2132475904U, // <3,6,u,6>: Cost 2 ins <u,6,4,6>, lane 0 + 1885310262U, // <3,6,u,7>: Cost 2 vzipr <1,0,3,u>, RHS + 1885310263U, // <3,6,u,u>: Cost 2 vzipr <1,0,3,u>, RHS + 2826960896U, // <3,7,0,0>: Cost 3 vuzpr <1,3,5,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2826960916U, // <3,7,0,2>: Cost 3 vuzpr <1,3,5,7>, <0,0,2,2> + 3002117840U, // <3,7,0,3>: Cost 3 vzipr <u,2,3,0>, <5,1,7,3> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2826961716U, // <3,7,1,1>: Cost 3 vuzpr <1,3,5,7>, <1,1,1,1> + 2103689217U, // <3,7,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1753219174U, // <3,7,1,3>: Cost 2 vuzpr <1,3,5,7>, LHS + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1753219179U, // <3,7,1,u>: Cost 2 vuzpr <1,3,5,7>, LHS + 2826961814U, // <3,7,2,0>: Cost 3 
vuzpr <1,3,5,7>, <1,2,3,0> + 3206692864U, // <3,7,2,1>: Cost 3 ins <u,7,2,1>, lane 0 + 2826961060U, // <3,7,2,2>: Cost 3 vuzpr <1,3,5,7>, <0,2,0,2> + 2132967424U, // <3,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0 + 2826961818U, // <3,7,2,4>: Cost 3 vuzpr <1,3,5,7>, <1,2,3,4> + 2826961072U, // <3,7,2,5>: Cost 3 vuzpr <1,3,5,7>, <0,2,1,5> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2826962598U, // <3,7,3,1>: Cost 3 vuzpr <1,3,5,7>, <2,3,0,1> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2103844865U, // <3,7,3,3>: Cost 2 ins <3,u,3,3>, lane 1 + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2826962638U, // <3,7,3,5>: Cost 3 vuzpr <1,3,5,7>, <2,3,4,5> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 1753220096U, // <3,7,3,7>: Cost 2 vuzpr <1,3,5,7>, <1,3,5,7> + 1753220096U, // <3,7,3,u>: Cost 2 vuzpr <1,3,5,7>, <1,3,5,7> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3002150608U, // <3,7,4,3>: Cost 3 vzipr <u,2,3,4>, <5,1,7,3> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2826961244U, // <3,7,4,6>: Cost 3 vuzpr <1,3,5,7>, <0,4,2,6> + 2732971383U, // <3,7,4,7>: Cost 3 vext3 LHS, <7,4,7,5> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2826963551U, // <3,7,5,0>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,0> + 2826963552U, // <3,7,5,1>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,1> + 2826962032U, // <3,7,5,2>: Cost 3 vuzpr <1,3,5,7>, <1,5,0,2> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2826963555U, // <3,7,5,4>: Cost 3 vuzpr <1,3,5,7>, <3,5,7,4> + 2826962044U, // <3,7,5,5>: Cost 3 vuzpr <1,3,5,7>, <1,5,1,5> + 
2104016897U, // <3,7,5,6>: Cost 2 ins <3,u,5,6>, lane 1 + 1753222454U, // <3,7,5,7>: Cost 2 vuzpr <1,3,5,7>, RHS + 1753222455U, // <3,7,5,u>: Cost 2 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2133295104U, // <3,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0 + 2133295104U, // <3,7,6,u>: Cost 2 ins <u,7,6,7>, lane 0 + 2962362223U, // <3,7,7,0>: Cost 3 vzipr <1,5,3,7>, <5,3,7,0> + 2826965109U, // <3,7,7,1>: Cost 3 vuzpr <1,3,5,7>, <5,7,0,1> + 2968998474U, // <3,7,7,2>: Cost 3 vzipr <2,6,3,7>, <6,3,7,2> + 2826963662U, // <3,7,7,3>: Cost 3 vuzpr <1,3,5,7>, <3,7,1,3> + 2962362227U, // <3,7,7,4>: Cost 3 vzipr <1,5,3,7>, <5,3,7,4> + 2826965149U, // <3,7,7,5>: Cost 3 vuzpr <1,3,5,7>, <5,7,4,5> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2826962300U, // <3,7,u,0>: Cost 3 vuzpr <1,3,5,7>, <1,u,3,0> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2103689217U, // <3,7,u,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1753219741U, // <3,7,u,3>: Cost 2 vuzpr <1,3,5,7>, LHS + 2826962304U, // <3,7,u,4>: Cost 3 vuzpr <1,3,5,7>, <1,u,3,4> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> + 1753222697U, // <3,7,u,7>: Cost 2 vuzpr <1,3,5,7>, RHS + 1753219746U, // <3,7,u,u>: Cost 2 vuzpr <1,3,5,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> + 1696243814U, // <3,u,0,2>: Cost 2 vuzpl <3,0,u,2>, LHS + 1616099045U, // <3,u,0,3>: Cost 2 vext3 
LHS, <u,0,3,2> + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> + 1829951642U, // <3,u,0,5>: Cost 2 vzipl <3,0,1,2>, RHS + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> + 1886571848U, // <3,u,0,7>: Cost 2 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 1964169370U, // <3,u,1,6>: Cost 2 vtrnl <3,0,1,2>, RHS + 2027212329U, // <3,u,1,7>: Cost 2 vtrnr <2,3,0,1>, RHS + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1659672428U, // <3,u,2,0>: Cost 2 vext3 LHS, <u,2,0,2> + 2128969728U, // <3,u,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1055244288U, // <3,u,2,3>: Cost 1 ins LHS, lane 0 + 1659672468U, // <3,u,2,4>: Cost 2 vext3 LHS, <u,2,4,6> + 2129002496U, // <3,u,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1885261128U, // <3,u,2,7>: Cost 2 vzipr <1,0,3,2>, RHS + 1055244288U, // <3,u,2,u>: Cost 1 ins LHS, lane 0 + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> + 1616541639U, // <3,u,3,1>: Cost 2 vext3 LHS, <u,3,1,3> + 1966315310U, // <3,u,3,2>: Cost 2 vtrnl <3,3,3,3>, LHS + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1966315674U, // <3,u,3,6>: Cost 2 vtrnl <3,3,3,3>, RHS + 1885932872U, // <3,u,3,7>: Cost 2 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2960344003U, // <3,u,4,0>: Cost 3 vzipr <1,2,3,4>, <1,2,u,0> + 1832933166U, // <3,u,4,1>: Cost 2 vzipl <3,4,5,6>, LHS + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> 
+ 1886601372U, // <3,u,4,3>: Cost 2 vzipr <1,2,3,4>, LHS + 1886602138U, // <3,u,4,4>: Cost 2 vzipr <1,2,3,4>, <1,2,3,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> + 1696247094U, // <3,u,4,6>: Cost 2 vuzpl <3,0,u,2>, RHS + 1886604616U, // <3,u,4,7>: Cost 2 vzipr <1,2,3,4>, RHS + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2128527360U, // <3,u,5,1>: Cost 2 ins <u,0,5,1>, lane 0 + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 2027538126U, // <3,u,5,5>: Cost 2 vtrnr <2,3,4,5>, <2,3,4,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1752935734U, // <3,u,5,7>: Cost 2 vuzpr <1,3,1,u>, RHS + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 1663875248U, // <3,u,6,0>: Cost 2 vext3 LHS, <u,6,0,2> + 2131918848U, // <3,u,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 2128609280U, // <3,u,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1663875288U, // <3,u,6,4>: Cost 2 vext3 LHS, <u,6,4,6> + 2131951616U, // <3,u,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 2131296256U, // <3,u,6,6>: Cost 2 ins <u,4,6,6>, lane 0 + 1058226176U, // <3,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <3,u,6,u>: Cost 1 ins RHS, lane 0 + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2098896898U, // <3,u,7,2>: Cost 2 ins <3,0,u,2>, lane 2 + 2021655197U, // <3,u,7,3>: Cost 2 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659230515U, // <3,u,7,5>: Cost 2 vext3 LHS, <u,7,5,7> + 2131369984U, // <3,u,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 2021658153U, // <3,u,7,7>: Cost 2 vtrnr <1,3,5,7>, RHS + 2021655202U, // <3,u,7,u>: Cost 2 vtrnr <1,3,5,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, 
<u,u,1,2> + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 1055244288U, // <3,u,u,3>: Cost 1 ins LHS, lane 0 + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1058226176U, // <3,u,u,7>: Cost 1 ins RHS, lane 0 + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2128150528U, // <4,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0 + 2104860674U, // <4,0,0,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1705607270U, // <4,0,0,2>: Cost 2 vuzpl <4,6,0,2>, LHS + 3178070019U, // <4,0,0,3>: Cost 3 ins <4,0,0,u>, lane 3 + 2909946194U, // <4,0,0,4>: Cost 3 vzipl <4,0,5,1>, <0,4,1,5> + 3178070019U, // <4,0,0,5>: Cost 3 ins <4,0,0,u>, lane 3 + 3183362049U, // <4,0,0,6>: Cost 3 ins <4,u,0,6>, lane 1 + 2109628417U, // <4,0,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 1705607324U, // <4,0,0,u>: Cost 2 vuzpl <4,6,0,2>, LHS + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2128232448U, // <4,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0 + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2833612902U, // <4,0,1,3>: Cost 3 vuzpr <2,4,6,0>, LHS + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2779350016U, // <4,0,1,5>: Cost 3 vuzpl <4,6,0,2>, <1,3,5,7> + 3202015232U, // <4,0,1,6>: Cost 3 ins <u,0,1,6>, lane 0 + 2109702145U, // <4,0,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 2104860674U, // <4,0,2,1>: Cost 2 ins <4,0,u,1>, lane 2 + 2128314368U, // <4,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0 + 2104918021U, // <4,0,2,3>: Cost 2 ins <4,0,u,u>, lane 5 + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3044622465U, // <4,0,2,5>: Cost 3 vtrnl <4,1,2,3>, <0,1,5,3> + 2833613004U, // <4,0,2,6>: Cost 3 vuzpr <2,4,6,0>, <0,2,4,6> + 2109775873U, // <4,0,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2104860674U, // <4,0,2,u>: Cost 2 ins <4,0,u,1>, lane 2 + 3202113536U, // <4,0,3,0>: 
Cost 3 ins <u,0,3,0>, lane 0 + 2104860674U, // <4,0,3,1>: Cost 2 ins <4,0,u,1>, lane 2 + 2128388096U, // <4,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 2779351452U, // <4,0,3,3>: Cost 3 vuzpl <4,6,0,2>, <3,3,3,3> + 3178627074U, // <4,0,3,4>: Cost 3 ins <4,0,u,4>, lane 2 + 2839512782U, // <4,0,3,5>: Cost 3 vuzpr <3,4,5,0>, <2,3,4,5> + 3178643458U, // <4,0,3,6>: Cost 3 ins <4,0,u,6>, lane 2 + 2109849601U, // <4,0,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2104860674U, // <4,0,3,u>: Cost 2 ins <4,0,u,1>, lane 2 + 1705610572U, // <4,0,4,0>: Cost 2 vuzpl <4,6,0,2>, <4,6,0,2> + 2104860674U, // <4,0,4,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1974370406U, // <4,0,4,2>: Cost 2 vtrnl <4,6,4,6>, LHS + 3178364931U, // <4,0,4,3>: Cost 3 ins <4,0,4,u>, lane 3 + 2109898753U, // <4,0,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2104918021U, // <4,0,4,5>: Cost 2 ins <4,0,u,u>, lane 5 + 1705610550U, // <4,0,4,6>: Cost 2 vuzpl <4,6,0,2>, RHS + 2109923329U, // <4,0,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 1705610568U, // <4,0,4,u>: Cost 2 vuzpl <4,6,0,2>, RHS + 1839644672U, // <4,0,5,0>: Cost 2 vzipl RHS, <0,0,0,0> + 765902950U, // <4,0,5,1>: Cost 1 vzipl RHS, LHS + 1839644836U, // <4,0,5,2>: Cost 2 vzipl RHS, <0,2,0,2> + 2104696835U, // <4,0,5,3>: Cost 2 ins <4,0,5,u>, lane 3 + 1839645010U, // <4,0,5,4>: Cost 2 vzipl RHS, <0,4,1,5> + 2109980673U, // <4,0,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 2104696835U, // <4,0,5,6>: Cost 2 ins <4,0,5,u>, lane 3 + 2104696835U, // <4,0,5,7>: Cost 2 ins <4,0,5,u>, lane 3 + 765903517U, // <4,0,5,u>: Cost 1 vzipl RHS, LHS + 1973862400U, // <4,0,6,0>: Cost 2 vtrnl RHS, <0,0,0,0> + 1973862410U, // <4,0,6,1>: Cost 2 vtrnl RHS, <0,0,1,1> + 900120678U, // <4,0,6,2>: Cost 1 vtrnl RHS, LHS + 2104770563U, // <4,0,6,3>: Cost 2 ins <4,0,6,u>, lane 3 + 1973862604U, // <4,0,6,4>: Cost 2 vtrnl RHS, <0,2,4,6> + 2104770563U, // <4,0,6,5>: Cost 2 ins <4,0,6,u>, lane 3 + 2110062593U, // <4,0,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,0,6,7>: Cost 1 ins RHS, lane 1 + 900120732U, // 
<4,0,6,u>: Cost 1 vtrnl RHS, LHS + 3202408448U, // <4,0,7,0>: Cost 3 ins <u,0,7,0>, lane 0 + 2104860674U, // <4,0,7,1>: Cost 2 ins <4,0,u,1>, lane 2 + 2104868866U, // <4,0,7,2>: Cost 2 ins <4,0,u,2>, lane 2 + 3114049557U, // <4,0,7,3>: Cost 3 vtrnr <4,4,6,7>, <0,0,2,3> + 3178627074U, // <4,0,7,4>: Cost 3 ins <4,0,u,4>, lane 2 + 2779354470U, // <4,0,7,5>: Cost 3 vuzpl <4,6,0,2>, <7,4,5,6> + 2779354473U, // <4,0,7,6>: Cost 3 vuzpl <4,6,0,2>, <7,4,6,0> + 2110144513U, // <4,0,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2104860674U, // <4,0,7,u>: Cost 2 ins <4,0,u,1>, lane 2 + 1974009856U, // <4,0,u,0>: Cost 2 vtrnl RHS, <0,0,0,0> + 767893606U, // <4,0,u,1>: Cost 1 vzipl RHS, LHS + 900268134U, // <4,0,u,2>: Cost 1 vtrnl RHS, LHS + 2104918021U, // <4,0,u,3>: Cost 2 ins <4,0,u,u>, lane 5 + 1974010060U, // <4,0,u,4>: Cost 2 vtrnl RHS, <0,2,4,6> + 2104918021U, // <4,0,u,5>: Cost 2 ins <4,0,u,u>, lane 5 + 1705613466U, // <4,0,u,6>: Cost 2 vuzpl <4,6,0,2>, RHS + 1036328961U, // <4,0,u,7>: Cost 1 ins RHS, lane 1 + 900268188U, // <4,0,u,u>: Cost 1 vtrnl RHS, LHS + 2600640614U, // <4,1,0,0>: Cost 3 vext1 <u,4,1,0>, LHS + 2128822272U, // <4,1,0,1>: Cost 2 ins <u,1,0,1>, lane 0 + 2109587457U, // <4,1,0,2>: Cost 2 ins <4,u,0,2>, lane 1 + 2128838656U, // <4,1,0,3>: Cost 2 ins <u,1,0,3>, lane 0 + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3047785472U, // <4,1,0,5>: Cost 3 vtrnl <4,6,0,2>, <1,3,5,7> + 3183362049U, // <4,1,0,6>: Cost 3 ins <4,u,0,6>, lane 1 + 2109628417U, // <4,1,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2109587457U, // <4,1,0,u>: Cost 2 ins <4,u,0,2>, lane 1 + 3202629632U, // <4,1,1,0>: Cost 3 ins <u,1,1,0>, lane 0 + 2128896000U, // <4,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0 + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2128912384U, // <4,1,1,3>: Cost 2 ins <u,1,1,3>, lane 0 + 3202662400U, // <4,1,1,4>: Cost 3 ins <u,1,1,4>, lane 0 + 2958401874U, // <4,1,1,5>: Cost 3 vzipr <0,u,4,1>, <0,4,1,5> + 2778801323U, // <4,1,1,6>: Cost 3 vuzpl 
<4,5,1,7>, <1,5,6,7> + 2109702145U, // <4,1,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2128896000U, // <4,1,1,u>: Cost 2 ins <u,1,1,1>, lane 0 + 2128961536U, // <4,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0 + 2128969728U, // <4,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 2128977920U, // <4,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0 + 1055244288U, // <4,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <4,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0 + 2129002496U, // <4,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2129010688U, // <4,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2129018880U, // <4,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0 + 1055244288U, // <4,1,2,u>: Cost 1 ins LHS, lane 0 + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2129059840U, // <4,1,3,3>: Cost 2 ins <u,1,3,3>, lane 0 + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2109849601U, // <4,1,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2129059840U, // <4,1,3,u>: Cost 2 ins <u,1,3,3>, lane 0 + 2600673382U, // <4,1,4,0>: Cost 3 vext1 <u,4,1,4>, LHS + 1705061641U, // <4,1,4,1>: Cost 2 vuzpl <4,5,1,7>, <4,5,1,7> + 2912641946U, // <4,1,4,2>: Cost 3 vzipl <4,4,5,6>, <1,2,3,4> + 2040135782U, // <4,1,4,3>: Cost 2 vtrnr <4,4,4,4>, LHS + 2109898753U, // <4,1,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2129149952U, // <4,1,4,5>: Cost 2 ins <u,1,4,5>, lane 0 + 2109915137U, // <4,1,4,6>: Cost 2 ins <4,u,4,6>, lane 1 + 2109923329U, // <4,1,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 2109915137U, // <4,1,4,u>: Cost 2 ins <4,u,4,6>, lane 1 + 1479164242U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, <0,4,1,5> + 1839645492U, // <4,1,5,1>: Cost 2 vzipl RHS, <1,1,1,1> + 1839645590U, // <4,1,5,2>: Cost 2 vzipl RHS, <1,2,3,0> + 2016034918U, // <4,1,5,3>: Cost 2 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,1,5,4>: Cost 2 vext1 
<0,4,1,5>, RHS + 1839645840U, // <4,1,5,5>: Cost 2 vzipl RHS, <1,5,3,7> + 3089776763U, // <4,1,5,6>: Cost 3 vtrnr <0,4,1,5>, <0,1,4,6> + 2109997057U, // <4,1,5,7>: Cost 2 ins <4,u,5,7>, lane 1 + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2110013441U, // <4,1,6,0>: Cost 2 ins <4,u,6,0>, lane 1 + 1973863220U, // <4,1,6,1>: Cost 2 vtrnl RHS, <1,1,1,1> + 2110029825U, // <4,1,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2016116838U, // <4,1,6,3>: Cost 2 vtrnr <0,4,2,6>, LHS + 2110046209U, // <4,1,6,4>: Cost 2 ins <4,u,6,4>, lane 1 + 1973863424U, // <4,1,6,5>: Cost 2 vtrnl RHS, <1,3,5,7> + 2110062593U, // <4,1,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,1,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,1,6,u>: Cost 1 ins RHS, lane 1 + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3203080192U, // <4,1,7,1>: Cost 3 ins <u,1,7,1>, lane 0 + 3203088384U, // <4,1,7,2>: Cost 3 ins <u,1,7,2>, lane 0 + 2129354752U, // <4,1,7,3>: Cost 2 ins <u,1,7,3>, lane 0 + 2664666470U, // <4,1,7,4>: Cost 3 vext2 <7,u,4,1>, <7,4,5,6> + 3203112960U, // <4,1,7,5>: Cost 3 ins <u,1,7,5>, lane 0 + 3114049641U, // <4,1,7,6>: Cost 3 vtrnr <4,4,6,7>, <0,1,2,6> + 2110144513U, // <4,1,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2129354752U, // <4,1,7,u>: Cost 2 ins <u,1,7,3>, lane 0 + 1479188821U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, <0,4,1,u> + 1974010676U, // <4,1,u,1>: Cost 2 vtrnl RHS, <1,1,1,1> + 1841636246U, // <4,1,u,2>: Cost 2 vzipl RHS, <1,2,3,0> + 1055244288U, // <4,1,u,3>: Cost 1 ins LHS, lane 0 + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 1974010880U, // <4,1,u,5>: Cost 2 vtrnl RHS, <1,3,5,7> + 2109915137U, // <4,1,u,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1036328961U, // <4,1,u,7>: Cost 1 ins RHS, lane 1 + 1055244288U, // <4,1,u,u>: Cost 1 ins LHS, lane 0 + 3047786150U, // <4,2,0,0>: Cost 3 vtrnl <4,6,0,2>, <2,3,0,1> + 2109579265U, // <4,2,0,1>: Cost 2 ins <4,u,0,1>, lane 1 + 2129494016U, // <4,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0 + 2967019622U, // 
<4,2,0,3>: Cost 3 vzipr <2,3,4,0>, LHS + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 2909947747U, // <4,2,0,5>: Cost 3 vzipl <4,0,5,1>, <2,5,3,1> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 2109628417U, // <4,2,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2129494016U, // <4,2,0,u>: Cost 2 ins <u,2,0,2>, lane 0 + 3203293184U, // <4,2,1,0>: Cost 3 ins <u,2,1,0>, lane 0 + 3203301376U, // <4,2,1,1>: Cost 3 ins <u,2,1,1>, lane 0 + 3203309568U, // <4,2,1,2>: Cost 3 ins <u,2,1,2>, lane 0 + 2821242982U, // <4,2,1,3>: Cost 3 vuzpr <0,4,0,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3203334144U, // <4,2,1,5>: Cost 3 ins <u,2,1,5>, lane 0 + 3203342336U, // <4,2,1,6>: Cost 3 ins <u,2,1,6>, lane 0 + 2109702145U, // <4,2,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2109702145U, // <4,2,1,u>: Cost 2 ins <4,u,1,7>, lane 1 + 2229208824U, // <4,2,2,0>: Cost 3 vrev <2,4,0,2> + 2911397400U, // <4,2,2,1>: Cost 3 vzipl <4,2,6,7>, <2,1,2,3> + 2129641472U, // <4,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0 + 2129649664U, // <4,2,2,3>: Cost 2 ins <u,2,2,3>, lane 0 + 2697954940U, // <4,2,2,4>: Cost 3 vext3 <2,2,4,4>, <2,2,4,4> + 2911397764U, // <4,2,2,5>: Cost 3 vzipl <4,2,6,7>, <2,5,6,7> + 2821243084U, // <4,2,2,6>: Cost 3 vuzpr <0,4,0,2>, <0,2,4,6> + 2109775873U, // <4,2,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2129641472U, // <4,2,2,u>: Cost 2 ins <u,2,2,2>, lane 0 + 2129698816U, // <4,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0 + 2229290754U, // <4,2,3,1>: Cost 3 vrev <2,4,1,3> + 3203457024U, // <4,2,3,2>: Cost 3 ins <u,2,3,2>, lane 0 + 2129723392U, // <4,2,3,3>: Cost 2 ins <u,2,3,3>, lane 0 + 2129731584U, // <4,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0 + 2833188558U, // <4,2,3,5>: Cost 3 vuzpr <2,4,0,2>, <2,3,4,5> + 3203489792U, // <4,2,3,6>: Cost 3 ins <u,2,3,6>, lane 0 + 2109849601U, // <4,2,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2129698816U, // <4,2,3,u>: Cost 2 ins <u,2,3,0>, lane 0 + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // 
<4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 1702448074U, // <4,2,4,2>: Cost 2 vuzpl <4,1,2,3>, <4,1,2,3> + 1905918054U, // <4,2,4,3>: Cost 2 vzipr <4,4,4,4>, LHS + 2109898753U, // <4,2,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2109906945U, // <4,2,4,5>: Cost 2 ins <4,u,4,5>, lane 1 + 2129821696U, // <4,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0 + 2109923329U, // <4,2,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 2129821696U, // <4,2,4,u>: Cost 2 ins <u,2,4,6>, lane 0 + 3089777558U, // <4,2,5,0>: Cost 3 vtrnr <0,4,1,5>, <1,2,3,0> + 2109947905U, // <4,2,5,1>: Cost 2 ins <4,u,5,1>, lane 1 + 1839646312U, // <4,2,5,2>: Cost 2 vzipl RHS, <2,2,2,2> + 1893318758U, // <4,2,5,3>: Cost 2 vzipr <2,3,4,5>, LHS + 3089777562U, // <4,2,5,4>: Cost 3 vtrnr <0,4,1,5>, <1,2,3,4> + 2109980673U, // <4,2,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 1839646650U, // <4,2,5,6>: Cost 2 vzipl RHS, <2,6,3,7> + 2109997057U, // <4,2,5,7>: Cost 2 ins <4,u,5,7>, lane 1 + 1893318763U, // <4,2,5,u>: Cost 2 vzipr <2,3,4,5>, LHS + 1479246172U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, <0,4,2,6> + 2110021633U, // <4,2,6,1>: Cost 2 ins <4,u,6,1>, lane 1 + 1973864040U, // <4,2,6,2>: Cost 2 vtrnl RHS, <2,2,2,2> + 1880719462U, // <4,2,6,3>: Cost 2 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2110054401U, // <4,2,6,5>: Cost 2 ins <4,u,6,5>, lane 1 + 2110062593U, // <4,2,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,2,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,2,6,u>: Cost 1 ins RHS, lane 1 + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3203743744U, // <4,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0 + 3203751936U, // <4,2,7,2>: Cost 3 ins <u,2,7,2>, lane 0 + 2130018304U, // <4,2,7,3>: Cost 2 ins <u,2,7,3>, lane 0 + 3102032794U, // <4,2,7,4>: Cost 3 vtrnr <2,4,5,7>, <1,2,3,4> + 2229618474U, // <4,2,7,5>: Cost 3 vrev <2,4,5,7> + 3203784704U, // <4,2,7,6>: Cost 3 ins <u,2,7,6>, lane 0 + 2110144513U, // <4,2,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2130018304U, // <4,2,7,u>: Cost 2 
ins <u,2,7,3>, lane 0 + 1479262558U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, <0,4,2,u> + 2109947905U, // <4,2,u,1>: Cost 2 ins <4,u,5,1>, lane 1 + 1974011496U, // <4,2,u,2>: Cost 2 vtrnl RHS, <2,2,2,2> + 1880735846U, // <4,2,u,3>: Cost 2 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2109980673U, // <4,2,u,5>: Cost 2 ins <4,u,5,5>, lane 1 + 1841637306U, // <4,2,u,6>: Cost 2 vzipl RHS, <2,6,3,7> + 1036328961U, // <4,2,u,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,2,u,u>: Cost 1 ins RHS, lane 1 + 3203883008U, // <4,3,0,0>: Cost 3 ins <u,3,0,0>, lane 0 + 2130149376U, // <4,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0 + 2109587457U, // <4,3,0,2>: Cost 2 ins <4,u,0,2>, lane 1 + 3047786908U, // <4,3,0,3>: Cost 3 vtrnl <4,6,0,2>, <3,3,3,3> + 2967020442U, // <4,3,0,4>: Cost 3 vzipr <2,3,4,0>, <1,2,3,4> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3183362049U, // <4,3,0,6>: Cost 3 ins <4,u,0,6>, lane 1 + 2109628417U, // <4,3,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2130149376U, // <4,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0 + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3203964928U, // <4,3,1,1>: Cost 3 ins <u,3,1,1>, lane 0 + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2130239488U, // <4,3,1,3>: Cost 2 ins <u,3,1,3>, lane 0 + 2967028634U, // <4,3,1,4>: Cost 3 vzipr <2,3,4,1>, <1,2,3,4> + 3203997696U, // <4,3,1,5>: Cost 3 ins <u,3,1,5>, lane 0 + 2821398633U, // <4,3,1,6>: Cost 3 vuzpr <0,4,2,3>, <0,1,2,6> + 2109702145U, // <4,3,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2130239488U, // <4,3,1,u>: Cost 2 ins <u,3,1,3>, lane 0 + 3204030464U, // <4,3,2,0>: Cost 3 ins <u,3,2,0>, lane 0 + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3204046848U, // <4,3,2,2>: Cost 3 ins <u,3,2,2>, lane 0 + 2130313216U, // <4,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0 + 2833269658U, // <4,3,2,4>: Cost 3 vuzpr <2,4,1,3>, <1,2,3,4> + 3101624014U, // <4,3,2,5>: Cost 3 vtrnr <2,4,0,2>, <2,3,4,5> + 3204079616U, // <4,3,2,6>: Cost 3 ins 
<u,3,2,6>, lane 0 + 2109775873U, // <4,3,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2130313216U, // <4,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0 + 3204104192U, // <4,3,3,0>: Cost 3 ins <u,3,3,0>, lane 0 + 2779564182U, // <4,3,3,1>: Cost 3 vuzpl <4,6,3,1>, <3,0,1,2> + 2636810580U, // <4,3,3,2>: Cost 3 vext2 <3,2,4,3>, <3,2,4,3> + 2130386944U, // <4,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0 + 2965717914U, // <4,3,3,4>: Cost 3 vzipr <2,1,4,3>, <1,2,3,4> + 2779597314U, // <4,3,3,5>: Cost 3 vuzpl <4,6,3,5>, <3,4,5,6> + 2778950237U, // <4,3,3,6>: Cost 3 vuzpl <4,5,3,7>, <3,5,6,7> + 2109849601U, // <4,3,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2130386944U, // <4,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0 + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 3183624193U, // <4,3,4,2>: Cost 3 ins <4,u,4,2>, lane 1 + 1747657049U, // <4,3,4,3>: Cost 2 vuzpr <0,4,2,3>, <0,4,2,3> + 2109898753U, // <4,3,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2130477056U, // <4,3,4,5>: Cost 2 ins <u,3,4,5>, lane 0 + 2109915137U, // <4,3,4,6>: Cost 2 ins <4,u,4,6>, lane 1 + 2109923329U, // <4,3,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 2130477056U, // <4,3,4,u>: Cost 2 ins <u,3,4,5>, lane 0 + 1839646870U, // <4,3,5,0>: Cost 2 vzipl RHS, <3,0,1,2> + 2109947905U, // <4,3,5,1>: Cost 2 ins <4,u,5,1>, lane 1 + 2967061238U, // <4,3,5,2>: Cost 3 vzipr <2,3,4,5>, <1,0,3,2> + 1839647132U, // <4,3,5,3>: Cost 2 vzipl RHS, <3,3,3,3> + 1839647234U, // <4,3,5,4>: Cost 2 vzipl RHS, <3,4,5,6> + 2109980673U, // <4,3,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 2913389176U, // <4,3,5,6>: Cost 3 vzipl RHS, <3,6,0,7> + 2130567168U, // <4,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0 + 1839647518U, // <4,3,5,u>: Cost 2 vzipl RHS, <3,u,1,2> + 2110013441U, // <4,3,6,0>: Cost 2 ins <4,u,6,0>, lane 1 + 1973864598U, // <4,3,6,1>: Cost 2 vtrnl RHS, <3,0,1,2> + 2110029825U, // <4,3,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 1973864860U, // <4,3,6,3>: Cost 2 vtrnl RHS, <3,3,3,3> + 2110046209U, // <4,3,6,4>: 
Cost 2 ins <4,u,6,4>, lane 1 + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 2110062593U, // <4,3,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,3,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,3,6,u>: Cost 1 ins RHS, lane 1 + 3204399104U, // <4,3,7,0>: Cost 3 ins <u,3,7,0>, lane 0 + 3204407296U, // <4,3,7,1>: Cost 3 ins <u,3,7,1>, lane 0 + 2660701368U, // <4,3,7,2>: Cost 3 vext2 <7,2,4,3>, <7,2,4,3> + 3204423680U, // <4,3,7,3>: Cost 3 ins <u,3,7,3>, lane 0 + 2968404890U, // <4,3,7,4>: Cost 3 vzipr <2,5,4,7>, <1,2,3,4> + 3204440064U, // <4,3,7,5>: Cost 3 ins <u,3,7,5>, lane 0 + 2235664908U, // <4,3,7,6>: Cost 3 vrev <3,4,6,7> + 2110144513U, // <4,3,7,7>: Cost 2 ins <4,u,7,7>, lane 1 + 2110144513U, // <4,3,7,u>: Cost 2 ins <4,u,7,7>, lane 1 + 1841637526U, // <4,3,u,0>: Cost 2 vzipl RHS, <3,0,1,2> + 1974012054U, // <4,3,u,1>: Cost 2 vtrnl RHS, <3,0,1,2> + 2109587457U, // <4,3,u,2>: Cost 2 ins <4,u,0,2>, lane 1 + 1974012316U, // <4,3,u,3>: Cost 2 vtrnl RHS, <3,3,3,3> + 1841637890U, // <4,3,u,4>: Cost 2 vzipl RHS, <3,4,5,6> + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2109915137U, // <4,3,u,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1036328961U, // <4,3,u,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,3,u,u>: Cost 1 ins RHS, lane 1 + 1974046028U, // <4,4,0,0>: Cost 2 vtrnl <4,6,0,2>, <4,6,0,2> + 2107572229U, // <4,4,0,1>: Cost 2 ins <4,4,u,u>, lane 5 + 1705934950U, // <4,4,0,2>: Cost 2 vuzpl <4,6,4,6>, LHS + 3180724227U, // <4,4,0,3>: Cost 3 ins <4,4,0,u>, lane 3 + 2107539458U, // <4,4,0,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,0,5>: Cost 2 ins <4,4,u,5>, lane 2 + 1974046006U, // <4,4,0,6>: Cost 2 vtrnl <4,6,0,2>, RHS + 2109628417U, // <4,4,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 1974046024U, // <4,4,0,u>: Cost 2 vtrnl <4,6,0,2>, RHS + 3204620288U, // <4,4,1,0>: Cost 3 ins <u,4,1,0>, lane 0 + 1836665802U, // <4,4,1,1>: Cost 2 vzipl <4,1,2,3>, <4,1,2,3> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 1771700326U, // 
<4,4,1,3>: Cost 2 vuzpr <4,4,4,4>, LHS + 2107539458U, // <4,4,1,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2130919424U, // <4,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0 + 2107555842U, // <4,4,1,6>: Cost 2 ins <4,4,u,6>, lane 2 + 2109702145U, // <4,4,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 2130919424U, // <4,4,1,u>: Cost 2 ins <u,4,1,5>, lane 0 + 2779678374U, // <4,4,2,0>: Cost 3 vuzpl <4,6,4,6>, <2,3,0,1> + 3044625673U, // <4,4,2,1>: Cost 3 vtrnl <4,1,2,3>, <4,5,1,7> + 1970883530U, // <4,4,2,2>: Cost 2 vtrnl <4,1,2,3>, <4,1,2,3> + 2107572229U, // <4,4,2,3>: Cost 2 ins <4,4,u,u>, lane 5 + 2107539458U, // <4,4,2,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,2,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2131001344U, // <4,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0 + 2109775873U, // <4,4,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2107572229U, // <4,4,2,u>: Cost 2 ins <4,4,u,u>, lane 5 + 3181248514U, // <4,4,3,0>: Cost 3 ins <4,4,u,0>, lane 2 + 2779678870U, // <4,4,3,1>: Cost 3 vuzpl <4,6,4,6>, <3,0,1,2> + 3181264898U, // <4,4,3,2>: Cost 3 ins <4,4,u,2>, lane 2 + 1880031352U, // <4,4,3,3>: Cost 2 vzipr <0,1,4,3>, <0,1,4,3> + 2107539458U, // <4,4,3,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,3,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2107555842U, // <4,4,3,6>: Cost 2 ins <4,4,u,6>, lane 2 + 2109849601U, // <4,4,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2107547650U, // <4,4,3,u>: Cost 2 ins <4,4,u,5>, lane 2 + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2107277315U, // <4,4,4,1>: Cost 2 ins <4,4,4,u>, lane 3 + 2107277315U, // <4,4,4,2>: Cost 2 ins <4,4,4,u>, lane 3 + 2107277315U, // <4,4,4,3>: Cost 2 ins <4,4,4,u>, lane 3 + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 2107547650U, // <4,4,4,5>: Cost 2 ins <4,4,u,5>, lane 2 + 1705938230U, // <4,4,4,6>: Cost 2 vuzpl <4,6,4,6>, RHS + 2109923329U, // <4,4,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 1839647634U, // <4,4,5,0>: Cost 2 vzipl RHS, <4,0,5,1> + 2109947905U, // <4,4,5,1>: Cost 2 ins <4,u,5,1>, 
lane 1 + 2107351043U, // <4,4,5,2>: Cost 2 ins <4,4,5,u>, lane 3 + 2107351043U, // <4,4,5,3>: Cost 2 ins <4,4,5,u>, lane 3 + 1839647952U, // <4,4,5,4>: Cost 2 vzipl RHS, <4,4,4,4> + 765906230U, // <4,4,5,5>: Cost 1 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2107351043U, // <4,4,5,7>: Cost 2 ins <4,4,5,u>, lane 3 + 765906473U, // <4,4,5,u>: Cost 1 vzipl RHS, RHS + 1973865804U, // <4,4,6,0>: Cost 2 vtrnl RHS, <4,6,0,2> + 2107424771U, // <4,4,6,1>: Cost 2 ins <4,4,6,u>, lane 3 + 2110029825U, // <4,4,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2107424771U, // <4,4,6,3>: Cost 2 ins <4,4,6,u>, lane 3 + 1973865680U, // <4,4,6,4>: Cost 2 vtrnl RHS, <4,4,4,4> + 1973865362U, // <4,4,6,5>: Cost 2 vtrnl RHS, <4,0,5,1> + 900123958U, // <4,4,6,6>: Cost 1 vtrnl RHS, RHS + 1036328961U, // <4,4,6,7>: Cost 1 ins RHS, lane 1 + 900123976U, // <4,4,6,u>: Cost 1 vtrnl RHS, RHS + 3181248514U, // <4,4,7,0>: Cost 3 ins <4,4,u,0>, lane 2 + 2779681786U, // <4,4,7,1>: Cost 3 vuzpl <4,6,4,6>, <7,0,1,2> + 3181264898U, // <4,4,7,2>: Cost 3 ins <4,4,u,2>, lane 2 + 2845442636U, // <4,4,7,3>: Cost 3 vuzpr <4,4,4,4>, <0,7,2,3> + 2107539458U, // <4,4,7,4>: Cost 2 ins <4,4,u,4>, lane 2 + 2107547650U, // <4,4,7,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2131369984U, // <4,4,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 2040311013U, // <4,4,7,7>: Cost 2 vtrnr <4,4,6,7>, <4,4,6,7> + 2107547650U, // <4,4,7,u>: Cost 2 ins <4,4,u,5>, lane 2 + 1974013260U, // <4,4,u,0>: Cost 2 vtrnl RHS, <4,6,0,2> + 2107572229U, // <4,4,u,1>: Cost 2 ins <4,4,u,u>, lane 5 + 1705940782U, // <4,4,u,2>: Cost 2 vuzpl <4,6,4,6>, LHS + 2107572229U, // <4,4,u,3>: Cost 2 ins <4,4,u,u>, lane 5 + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 767896886U, // <4,4,u,5>: Cost 1 vzipl RHS, RHS + 900271414U, // <4,4,u,6>: Cost 1 vtrnl RHS, RHS + 1036328961U, // <4,4,u,7>: Cost 1 ins RHS, lane 1 + 900271432U, // <4,4,u,u>: Cost 1 vtrnl RHS, RHS + 2108170242U, // <4,5,0,0>: Cost 2 ins <4,5,u,0>, lane 2 + 1034493957U, // <4,5,0,1>: Cost 1 
ins RHS, lane 5 + 1707294822U, // <4,5,0,2>: Cost 2 vuzpl <4,u,5,1>, LHS + 2108194818U, // <4,5,0,3>: Cost 2 ins <4,5,u,3>, lane 2 + 2108203010U, // <4,5,0,4>: Cost 2 ins <4,5,u,4>, lane 2 + 2108211202U, // <4,5,0,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2108219394U, // <4,5,0,6>: Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // <4,5,0,7>: Cost 1 ins RHS, lane 2 + 1034493957U, // <4,5,0,u>: Cost 1 ins RHS, lane 5 + 2108170242U, // <4,5,1,0>: Cost 2 ins <4,5,u,0>, lane 2 + 2133540868U, // <4,5,1,1>: Cost 2 ins <u,u,1,1>, lane 4 + 2133549060U, // <4,5,1,2>: Cost 2 ins <u,u,1,2>, lane 4 + 1747599462U, // <4,5,1,3>: Cost 2 vuzpr <0,4,1,5>, LHS + 2108203010U, // <4,5,1,4>: Cost 2 ins <4,5,u,4>, lane 2 + 2133573636U, // <4,5,1,5>: Cost 2 ins <u,u,1,5>, lane 4 + 2108219394U, // <4,5,1,6>: Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // <4,5,1,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,1,u>: Cost 1 ins RHS, lane 2 + 2108170242U, // <4,5,2,0>: Cost 2 ins <4,5,u,0>, lane 2 + 2108178434U, // <4,5,2,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2133622788U, // <4,5,2,2>: Cost 2 ins <u,u,2,2>, lane 4 + 1059889156U, // <4,5,2,3>: Cost 1 ins LHS, lane 4 + 2108203010U, // <4,5,2,4>: Cost 2 ins <4,5,u,4>, lane 2 + 2108211202U, // <4,5,2,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2133655556U, // <4,5,2,6>: Cost 2 ins <u,u,2,6>, lane 4 + 1034485762U, // <4,5,2,7>: Cost 1 ins RHS, lane 2 + 1059889156U, // <4,5,2,u>: Cost 1 ins LHS, lane 4 + 2133680132U, // <4,5,3,0>: Cost 2 ins <u,u,3,0>, lane 4 + 2108178434U, // <4,5,3,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2133696516U, // <4,5,3,2>: Cost 2 ins <u,u,3,2>, lane 4 + 2133704708U, // <4,5,3,3>: Cost 2 ins <u,u,3,3>, lane 4 + 2133712900U, // <4,5,3,4>: Cost 2 ins <u,u,3,4>, lane 4 + 2108211202U, // <4,5,3,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2108219394U, // <4,5,3,6>: Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // <4,5,3,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,3,u>: Cost 1 ins RHS, lane 2 + 2108170242U, // <4,5,4,0>: Cost 2 ins <4,5,u,0>, lane 2 + 
2108178434U, // <4,5,4,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2108186626U, // <4,5,4,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2108194818U, // <4,5,4,3>: Cost 2 ins <4,5,u,3>, lane 2 + 2109898753U, // <4,5,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 1034493957U, // <4,5,4,5>: Cost 1 ins RHS, lane 5 + 1707298102U, // <4,5,4,6>: Cost 2 vuzpl <4,u,5,1>, RHS + 1034485762U, // <4,5,4,7>: Cost 1 ins RHS, lane 2 + 1034493957U, // <4,5,4,u>: Cost 1 ins RHS, lane 5 + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 1839656656U, // <4,5,5,1>: Cost 2 vzipl RHS, <5,1,7,3> + 2108186626U, // <4,5,5,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2108194818U, // <4,5,5,3>: Cost 2 ins <4,5,u,3>, lane 2 + 1839648710U, // <4,5,5,4>: Cost 2 vzipl RHS, <5,4,7,6> + 1839648772U, // <4,5,5,5>: Cost 2 vzipl RHS, <5,5,5,5> + 1839648866U, // <4,5,5,6>: Cost 2 vzipl RHS, <5,6,7,0> + 1034485762U, // <4,5,5,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,5,u>: Cost 1 ins RHS, lane 2 + 1034346499U, // <4,5,6,0>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,1>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,2>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,3>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,4>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,5>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,5,6,6>: Cost 1 ins RHS, lane 3 + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2133975044U, // <4,5,7,0>: Cost 2 ins <u,u,7,0>, lane 4 + 2108178434U, // <4,5,7,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2108186626U, // <4,5,7,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2133999620U, // <4,5,7,3>: Cost 2 ins <u,u,7,3>, lane 4 + 2134007812U, // <4,5,7,4>: Cost 2 ins <u,u,7,4>, lane 4 + 2108211202U, // <4,5,7,5>: Cost 2 ins <4,5,u,5>, lane 2 + 2134024196U, // <4,5,7,6>: Cost 2 ins <u,u,7,6>, lane 4 + 1034485762U, // <4,5,7,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,5,7,u>: Cost 1 ins RHS, lane 2 + 1034346499U, // <4,5,u,0>: Cost 1 ins RHS, lane 3 + 1034493957U, // <4,5,u,1>: Cost 1 ins 
RHS, lane 5 + 1034346499U, // <4,5,u,2>: Cost 1 ins RHS, lane 3 + 1059889156U, // <4,5,u,3>: Cost 1 ins LHS, lane 4 + 1034346499U, // <4,5,u,4>: Cost 1 ins RHS, lane 3 + 1034493957U, // <4,5,u,5>: Cost 1 ins RHS, lane 5 + 1034346499U, // <4,5,u,6>: Cost 1 ins RHS, lane 3 + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 1705426944U, // <4,6,0,0>: Cost 2 vuzpl RHS, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 631685222U, // <4,6,0,2>: Cost 1 vuzpl RHS, LHS + 2108309507U, // <4,6,0,3>: Cost 2 ins <4,6,0,u>, lane 3 + 1705427148U, // <4,6,0,4>: Cost 2 vuzpl RHS, <0,2,4,6> + 2108309507U, // <4,6,0,5>: Cost 2 ins <4,6,0,u>, lane 3 + 2108882946U, // <4,6,0,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2108309507U, // <4,6,0,7>: Cost 2 ins <4,6,0,u>, lane 3 + 631685276U, // <4,6,0,u>: Cost 1 vuzpl RHS, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 1705427764U, // <4,6,1,1>: Cost 2 vuzpl RHS, <1,1,1,1> + 2108850178U, // <4,6,1,2>: Cost 2 ins <4,6,u,2>, lane 2 + 1747681382U, // <4,6,1,3>: Cost 2 vuzpr <0,4,2,6>, LHS + 2779169619U, // <4,6,1,4>: Cost 3 vuzpl RHS, <1,1,4,5> + 1705427968U, // <4,6,1,5>: Cost 2 vuzpl RHS, <1,3,5,7> + 2108882946U, // <4,6,1,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2109702145U, // <4,6,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 1747681387U, // <4,6,1,u>: Cost 2 vuzpr <0,4,2,6>, LHS + 1705428646U, // <4,6,2,0>: Cost 2 vuzpl RHS, <2,3,0,1> + 2779170237U, // <4,6,2,1>: Cost 3 vuzpl RHS, <2,0,1,2> + 1705428584U, // <4,6,2,2>: Cost 2 vuzpl RHS, <2,2,2,2> + 1705428594U, // <4,6,2,3>: Cost 2 vuzpl RHS, <2,2,3,3> + 1705428686U, // <4,6,2,4>: Cost 2 vuzpl RHS, <2,3,4,5> + 2839560386U, // <4,6,2,5>: Cost 3 vuzpr <3,4,5,6>, <0,2,3,5> + 2108882946U, // <4,6,2,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2109775873U, // <4,6,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 1705428639U, // <4,6,2,u>: Cost 2 vuzpl RHS, <2,2,u,3> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 1705429142U, // <4,6,3,1>: 
Cost 2 vuzpl RHS, <3,0,1,2> + 2108850178U, // <4,6,3,2>: Cost 2 ins <4,6,u,2>, lane 2 + 1705429404U, // <4,6,3,3>: Cost 2 vuzpl RHS, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 1705429506U, // <4,6,3,5>: Cost 2 vuzpl RHS, <3,4,5,6> + 2108882946U, // <4,6,3,6>: Cost 2 ins <4,6,u,6>, lane 2 + 2132410368U, // <4,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0 + 1705429205U, // <4,6,3,u>: Cost 2 vuzpl RHS, <3,0,u,2> + 1705430348U, // <4,6,4,0>: Cost 2 vuzpl RHS, <4,6,0,2> + 2108604419U, // <4,6,4,1>: Cost 2 ins <4,6,4,u>, lane 3 + 2108850178U, // <4,6,4,2>: Cost 2 ins <4,6,u,2>, lane 2 + 2108604419U, // <4,6,4,3>: Cost 2 ins <4,6,4,u>, lane 3 + 1705430224U, // <4,6,4,4>: Cost 2 vuzpl RHS, <4,4,4,4> + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 631688502U, // <4,6,4,6>: Cost 1 vuzpl RHS, RHS + 2108604419U, // <4,6,4,7>: Cost 2 ins <4,6,4,u>, lane 3 + 631688520U, // <4,6,4,u>: Cost 1 vuzpl RHS, RHS + 2839563567U, // <4,6,5,0>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,0> + 1705439360U, // <4,6,5,1>: Cost 2 vuzpl RHS, <5,7,1,3> + 1839657466U, // <4,6,5,2>: Cost 2 vzipl RHS, <6,2,7,3> + 2839563570U, // <4,6,5,3>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,3> + 2839563571U, // <4,6,5,4>: Cost 3 vuzpr <3,4,5,6>, <4,5,6,4> + 1705431044U, // <4,6,5,5>: Cost 2 vuzpl RHS, <5,5,5,5> + 1839649592U, // <4,6,5,6>: Cost 2 vzipl RHS, <6,6,6,6> + 1747684662U, // <4,6,5,7>: Cost 2 vuzpr <0,4,2,6>, RHS + 1747684663U, // <4,6,5,u>: Cost 2 vuzpr <0,4,2,6>, RHS + 1705431886U, // <4,6,6,0>: Cost 2 vuzpl RHS, <6,7,0,1> + 2110021633U, // <4,6,6,1>: Cost 2 ins <4,u,6,1>, lane 1 + 2110029825U, // <4,6,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2110038017U, // <4,6,6,3>: Cost 2 ins <4,u,6,3>, lane 1 + 1705431926U, // <4,6,6,4>: Cost 2 vuzpl RHS, <6,7,4,5> + 2110054401U, // <4,6,6,5>: Cost 2 ins <4,u,6,5>, lane 1 + 1705431864U, // <4,6,6,6>: Cost 2 vuzpl RHS, <6,6,6,6> + 1036328961U, // <4,6,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,6,6,u>: Cost 1 ins RHS, lane 1 + 2132647936U, // 
<4,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0 + 1705432058U, // <4,6,7,1>: Cost 2 vuzpl RHS, <7,0,1,2> + 2108850178U, // <4,6,7,2>: Cost 2 ins <4,6,u,2>, lane 2 + 2779173980U, // <4,6,7,3>: Cost 3 vuzpl RHS, <7,1,3,1> + 2132680704U, // <4,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0 + 1705432422U, // <4,6,7,5>: Cost 2 vuzpl RHS, <7,4,5,6> + 2108882946U, // <4,6,7,6>: Cost 2 ins <4,6,u,6>, lane 2 + 1705432684U, // <4,6,7,7>: Cost 2 vuzpl RHS, <7,7,7,7> + 1705432121U, // <4,6,7,u>: Cost 2 vuzpl RHS, <7,0,u,2> + 1705433020U, // <4,6,u,0>: Cost 2 vuzpl RHS, <u,3,0,1> + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 631691054U, // <4,6,u,2>: Cost 1 vuzpl RHS, LHS + 1747681949U, // <4,6,u,3>: Cost 2 vuzpr <0,4,2,6>, LHS + 1705433060U, // <4,6,u,4>: Cost 2 vuzpl RHS, <u,3,4,5> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 631691418U, // <4,6,u,6>: Cost 1 vuzpl RHS, RHS + 1036328961U, // <4,6,u,7>: Cost 1 ins RHS, lane 1 + 631691108U, // <4,6,u,u>: Cost 1 vuzpl RHS, LHS + 3206537216U, // <4,7,0,0>: Cost 3 ins <u,7,0,0>, lane 0 + 2132803584U, // <4,7,0,1>: Cost 2 ins <u,7,0,1>, lane 0 + 2109587457U, // <4,7,0,2>: Cost 2 ins <4,u,0,2>, lane 1 + 2845614101U, // <4,7,0,3>: Cost 3 vuzpr <4,4,6,7>, <0,0,2,3> + 3206569984U, // <4,7,0,4>: Cost 3 ins <u,7,0,4>, lane 0 + 3047789926U, // <4,7,0,5>: Cost 3 vtrnl <4,6,0,2>, <7,4,5,6> + 3047789929U, // <4,7,0,6>: Cost 3 vtrnl <4,6,0,2>, <7,4,6,0> + 2109628417U, // <4,7,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 2132803584U, // <4,7,0,u>: Cost 2 ins <u,7,0,1>, lane 0 + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3206619136U, // <4,7,1,1>: Cost 3 ins <u,7,1,1>, lane 0 + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 2132893696U, // <4,7,1,3>: Cost 2 ins <u,7,1,3>, lane 0 + 3206643712U, // <4,7,1,4>: Cost 3 ins <u,7,1,4>, lane 0 + 3206651904U, // <4,7,1,5>: Cost 3 ins <u,7,1,5>, lane 0 + 2988265414U, // <4,7,1,6>: Cost 3 vzipr <5,u,4,1>, <5,4,7,6> + 2109702145U, // <4,7,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 
2132893696U, // <4,7,1,u>: Cost 2 ins <u,7,1,3>, lane 0 + 3206684672U, // <4,7,2,0>: Cost 3 ins <u,7,2,0>, lane 0 + 3206692864U, // <4,7,2,1>: Cost 3 ins <u,7,2,1>, lane 0 + 3206701056U, // <4,7,2,2>: Cost 3 ins <u,7,2,2>, lane 0 + 2132967424U, // <4,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0 + 2833597338U, // <4,7,2,4>: Cost 3 vuzpr <2,4,5,7>, <1,2,3,4> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3206733824U, // <4,7,2,6>: Cost 3 ins <u,7,2,6>, lane 0 + 2109775873U, // <4,7,2,7>: Cost 2 ins <4,u,2,7>, lane 1 + 2132967424U, // <4,7,2,u>: Cost 2 ins <u,7,2,3>, lane 0 + 3206758400U, // <4,7,3,0>: Cost 3 ins <u,7,3,0>, lane 0 + 3206766592U, // <4,7,3,1>: Cost 3 ins <u,7,3,1>, lane 0 + 3047388245U, // <4,7,3,2>: Cost 3 vtrnl <4,5,3,7>, <7,1,2,3> + 3206782976U, // <4,7,3,3>: Cost 3 ins <u,7,3,3>, lane 0 + 2989609062U, // <4,7,3,4>: Cost 3 vzipr <6,1,4,3>, <5,6,7,4> + 3206799360U, // <4,7,3,5>: Cost 3 ins <u,7,3,5>, lane 0 + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 2109849601U, // <4,7,3,7>: Cost 2 ins <4,u,3,7>, lane 1 + 2109849601U, // <4,7,3,u>: Cost 2 ins <4,u,3,7>, lane 1 + 2583199846U, // <4,7,4,0>: Cost 3 vext1 <5,4,7,4>, LHS + 3048117242U, // <4,7,4,1>: Cost 3 vtrnl <4,6,4,6>, <7,0,1,2> + 3183624193U, // <4,7,4,2>: Cost 3 ins <4,u,4,2>, lane 1 + 2979659923U, // <4,7,4,3>: Cost 3 vzipr <4,4,4,4>, <0,1,7,3> + 2109898753U, // <4,7,4,4>: Cost 2 ins <4,u,4,4>, lane 1 + 2133131264U, // <4,7,4,5>: Cost 2 ins <u,7,4,5>, lane 0 + 2109915137U, // <4,7,4,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1771875557U, // <4,7,4,7>: Cost 2 vuzpr <4,4,6,7>, <4,4,6,7> + 2133131264U, // <4,7,4,u>: Cost 2 ins <u,7,4,5>, lane 0 + 1839649786U, // <4,7,5,0>: Cost 2 vzipl RHS, <7,0,1,2> + 2109947905U, // <4,7,5,1>: Cost 2 ins <4,u,5,1>, lane 1 + 2913391781U, // <4,7,5,2>: Cost 3 vzipl RHS, <7,2,2,2> + 2913391843U, // <4,7,5,3>: Cost 3 vzipl RHS, <7,3,0,1> + 1839650150U, // <4,7,5,4>: Cost 2 vzipl RHS, <7,4,5,6> + 2109980673U, // <4,7,5,5>: Cost 2 ins <4,u,5,5>, 
lane 1 + 2913392145U, // <4,7,5,6>: Cost 3 vzipl RHS, <7,6,6,6> + 1839650412U, // <4,7,5,7>: Cost 2 vzipl RHS, <7,7,7,7> + 1839650434U, // <4,7,5,u>: Cost 2 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 1973867514U, // <4,7,6,1>: Cost 2 vtrnl RHS, <7,0,1,2> + 2110029825U, // <4,7,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2110038017U, // <4,7,6,3>: Cost 2 ins <4,u,6,3>, lane 1 + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1973867878U, // <4,7,6,5>: Cost 2 vtrnl RHS, <7,4,5,6> + 2110062593U, // <4,7,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <4,7,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,7,6,u>: Cost 1 ins RHS, lane 1 + 2914587642U, // <4,7,7,0>: Cost 3 vzipl <4,7,5,0>, <7,0,1,2> + 2779862010U, // <4,7,7,1>: Cost 3 vuzpl <4,6,7,1>, <7,0,1,2> + 2779247701U, // <4,7,7,2>: Cost 3 vuzpl <4,5,7,7>, <7,1,2,3> + 3207077888U, // <4,7,7,3>: Cost 3 ins <u,7,7,3>, lane 0 + 2914620774U, // <4,7,7,4>: Cost 3 vzipl <4,7,5,4>, <7,4,5,6> + 2779895142U, // <4,7,7,5>: Cost 3 vuzpl <4,6,7,5>, <7,4,5,6> + 2992295878U, // <4,7,7,6>: Cost 3 vzipr <6,5,4,7>, <5,4,7,6> + 2133368832U, // <4,7,7,7>: Cost 2 ins <u,7,7,7>, lane 0 + 2133368832U, // <4,7,7,u>: Cost 2 ins <u,7,7,7>, lane 0 + 1841640442U, // <4,7,u,0>: Cost 2 vzipl RHS, <7,0,1,2> + 1974014970U, // <4,7,u,1>: Cost 2 vtrnl RHS, <7,0,1,2> + 2109587457U, // <4,7,u,2>: Cost 2 ins <4,u,0,2>, lane 1 + 2132893696U, // <4,7,u,3>: Cost 2 ins <u,7,1,3>, lane 0 + 1841640806U, // <4,7,u,4>: Cost 2 vzipl RHS, <7,4,5,6> + 1974015334U, // <4,7,u,5>: Cost 2 vtrnl RHS, <7,4,5,6> + 2109915137U, // <4,7,u,6>: Cost 2 ins <4,u,4,6>, lane 1 + 1036328961U, // <4,7,u,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <4,7,u,u>: Cost 1 ins RHS, lane 1 + 1705574400U, // <4,u,0,0>: Cost 2 vuzpl RHS, <0,0,0,0> + 1034493957U, // <4,u,0,1>: Cost 1 ins RHS, lane 5 + 631832678U, // <4,u,0,2>: Cost 1 vuzpl RHS, LHS + 2108309507U, // <4,u,0,3>: Cost 2 ins <4,6,0,u>, lane 3 + 1705574604U, // <4,u,0,4>: Cost 2 
vuzpl RHS, <0,2,4,6> + 2107547650U, // <4,u,0,5>: Cost 2 ins <4,4,u,5>, lane 2 + 1974048922U, // <4,u,0,6>: Cost 2 vtrnl <4,6,0,2>, RHS + 1034485762U, // <4,u,0,7>: Cost 1 ins RHS, lane 2 + 631832732U, // <4,u,0,u>: Cost 1 vuzpl RHS, LHS + 2108170242U, // <4,u,1,0>: Cost 2 ins <4,5,u,0>, lane 2 + 1705575220U, // <4,u,1,1>: Cost 2 vuzpl RHS, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1747624038U, // <4,u,1,3>: Cost 2 vuzpr <0,4,1,u>, LHS + 2107539458U, // <4,u,1,4>: Cost 2 ins <4,4,u,4>, lane 2 + 1705575424U, // <4,u,1,5>: Cost 2 vuzpl RHS, <1,3,5,7> + 2107555842U, // <4,u,1,6>: Cost 2 ins <4,4,u,6>, lane 2 + 1034485762U, // <4,u,1,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,u,1,u>: Cost 1 ins RHS, lane 2 + 1705576102U, // <4,u,2,0>: Cost 2 vuzpl RHS, <2,3,0,1> + 2104860674U, // <4,u,2,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1705576040U, // <4,u,2,2>: Cost 2 vuzpl RHS, <2,2,2,2> + 1055244288U, // <4,u,2,3>: Cost 1 ins LHS, lane 0 + 1705576142U, // <4,u,2,4>: Cost 2 vuzpl RHS, <2,3,4,5> + 2107547650U, // <4,u,2,5>: Cost 2 ins <4,4,u,5>, lane 2 + 2131001344U, // <4,u,2,6>: Cost 2 ins <u,4,2,6>, lane 0 + 1034485762U, // <4,u,2,7>: Cost 1 ins RHS, lane 2 + 1055244288U, // <4,u,2,u>: Cost 1 ins LHS, lane 0 + 2129698816U, // <4,u,3,0>: Cost 2 ins <u,2,3,0>, lane 0 + 1705576598U, // <4,u,3,1>: Cost 2 vuzpl RHS, <3,0,1,2> + 2128388096U, // <4,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 1705576860U, // <4,u,3,3>: Cost 2 vuzpl RHS, <3,3,3,3> + 2129731584U, // <4,u,3,4>: Cost 2 ins <u,2,3,4>, lane 0 + 1705576962U, // <4,u,3,5>: Cost 2 vuzpl RHS, <3,4,5,6> + 2107555842U, // <4,u,3,6>: Cost 2 ins <4,4,u,6>, lane 2 + 1034485762U, // <4,u,3,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,u,3,u>: Cost 1 ins RHS, lane 2 + 1705577804U, // <4,u,4,0>: Cost 2 vuzpl RHS, <4,6,0,2> + 2104860674U, // <4,u,4,1>: Cost 2 ins <4,0,u,1>, lane 2 + 1974376238U, // <4,u,4,2>: Cost 2 vtrnl <4,6,4,6>, LHS + 2108604419U, // <4,u,4,3>: Cost 2 ins <4,6,4,u>, lane 3 + 161926454U, // 
<4,u,4,4>: Cost 1 vdup0 RHS + 1034493957U, // <4,u,4,5>: Cost 1 ins RHS, lane 5 + 631835958U, // <4,u,4,6>: Cost 1 vuzpl RHS, RHS + 1034485762U, // <4,u,4,7>: Cost 1 ins RHS, lane 2 + 631835976U, // <4,u,4,u>: Cost 1 vuzpl RHS, RHS + 1839650515U, // <4,u,5,0>: Cost 2 vzipl RHS, <u,0,1,2> + 765908782U, // <4,u,5,1>: Cost 1 vzipl RHS, LHS + 1839650693U, // <4,u,5,2>: Cost 2 vzipl RHS, <u,2,3,0> + 2016035485U, // <4,u,5,3>: Cost 2 vtrnr <0,4,1,5>, LHS + 1839650879U, // <4,u,5,4>: Cost 2 vzipl RHS, <u,4,5,6> + 765909146U, // <4,u,5,5>: Cost 1 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 1034485762U, // <4,u,5,7>: Cost 1 ins RHS, lane 2 + 765909349U, // <4,u,5,u>: Cost 1 vzipl RHS, LHS + 1034346499U, // <4,u,6,0>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,u,6,1>: Cost 1 ins RHS, lane 3 + 900126510U, // <4,u,6,2>: Cost 1 vtrnl RHS, LHS + 1034346499U, // <4,u,6,3>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,u,6,4>: Cost 1 ins RHS, lane 3 + 1034346499U, // <4,u,6,5>: Cost 1 ins RHS, lane 3 + 900126874U, // <4,u,6,6>: Cost 1 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2133975044U, // <4,u,7,0>: Cost 2 ins <u,u,7,0>, lane 4 + 1705579514U, // <4,u,7,1>: Cost 2 vuzpl RHS, <7,0,1,2> + 2104868866U, // <4,u,7,2>: Cost 2 ins <4,0,u,2>, lane 2 + 2129354752U, // <4,u,7,3>: Cost 2 ins <u,1,7,3>, lane 0 + 2134007812U, // <4,u,7,4>: Cost 2 ins <u,u,7,4>, lane 4 + 1705579878U, // <4,u,7,5>: Cost 2 vuzpl RHS, <7,4,5,6> + 2131369984U, // <4,u,7,6>: Cost 2 ins <u,4,7,6>, lane 0 + 1034485762U, // <4,u,7,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <4,u,7,u>: Cost 1 ins RHS, lane 2 + 1034346499U, // <4,u,u,0>: Cost 1 ins RHS, lane 3 + 767899438U, // <4,u,u,1>: Cost 1 vzipl RHS, LHS + 631838510U, // <4,u,u,2>: Cost 1 vuzpl RHS, LHS + 1055244288U, // <4,u,u,3>: Cost 1 ins LHS, lane 0 + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 767899802U, // <4,u,u,5>: Cost 1 vzipl RHS, RHS + 631838874U, // <4,u,u,6>: 
Cost 1 vuzpl RHS, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2128150528U, // <5,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0 + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 2846220309U, // <5,0,0,3>: Cost 3 vuzpr <4,5,6,0>, <0,0,2,3> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 2583318482U, // <5,0,0,5>: Cost 3 vext1 <5,5,0,0>, <5,5,0,0> + 3189334017U, // <5,0,0,6>: Cost 3 ins <5,u,0,6>, lane 1 + 2846223265U, // <5,0,0,7>: Cost 3 vuzpr <4,5,6,0>, <4,0,6,7> + 2128150528U, // <5,0,0,u>: Cost 2 ins <u,0,0,0>, lane 0 + 1503608934U, // <5,0,1,0>: Cost 2 vext1 <4,5,0,1>, LHS + 1843003494U, // <5,0,1,1>: Cost 2 vzipl <5,1,7,3>, LHS + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2115641345U, // <5,0,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 1611612282U, // <5,0,1,4>: Cost 2 vext3 <0,1,4,5>, <0,1,4,5> + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3202015232U, // <5,0,1,6>: Cost 3 ins <u,0,1,6>, lane 0 + 3189415937U, // <5,0,1,7>: Cost 3 ins <5,u,1,7>, lane 1 + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2128314368U, // <5,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0 + 2128322560U, // <5,0,2,3>: Cost 2 ins <u,0,2,3>, lane 0 + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 3189481473U, // <5,0,2,6>: Cost 3 ins <5,u,2,6>, lane 1 + 2595280262U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,5,0,2> + 2128314368U, // <5,0,2,u>: Cost 2 ins <u,0,2,2>, lane 0 + 3202113536U, // <5,0,3,0>: Cost 3 ins <u,0,3,0>, lane 0 + 2918047846U, // <5,0,3,1>: Cost 3 vzipl <5,3,7,0>, LHS + 2128388096U, // <5,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 3189530625U, // <5,0,3,3>: Cost 3 ins <5,u,3,3>, lane 1 + 2638187004U, // <5,0,3,4>: Cost 
3 vext2 <3,4,5,0>, <3,4,5,0> + 2785315330U, // <5,0,3,5>: Cost 3 vuzpl <5,6,0,1>, <3,4,5,6> + 3202162688U, // <5,0,3,6>: Cost 3 ins <u,0,3,6>, lane 0 + 2840323072U, // <5,0,3,7>: Cost 3 vuzpr <3,5,7,0>, <1,3,5,7> + 2128388096U, // <5,0,3,u>: Cost 2 ins <u,0,3,2>, lane 0 + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3184336899U, // <5,0,4,3>: Cost 3 ins <5,0,4,u>, lane 3 + 2687345005U, // <5,0,4,4>: Cost 3 vext3 <0,4,4,5>, <0,4,4,5> + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2846222850U, // <5,0,4,6>: Cost 3 vuzpr <4,5,6,0>, <3,4,5,6> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1845019293U, // <5,0,4,u>: Cost 2 vzipl <5,4,7,6>, LHS + 1772481839U, // <5,0,5,0>: Cost 2 vuzpr <4,5,6,0>, <4,5,6,0> + 1845526630U, // <5,0,5,1>: Cost 2 vzipl <5,5,5,5>, LHS + 1979744358U, // <5,0,5,2>: Cost 2 vtrnl <5,5,5,5>, LHS + 3189678081U, // <5,0,5,3>: Cost 3 ins <5,u,5,3>, lane 1 + 2919268690U, // <5,0,5,4>: Cost 3 vzipl <5,5,5,5>, <0,4,1,5> + 2115952641U, // <5,0,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 3202310144U, // <5,0,5,6>: Cost 3 ins <u,0,5,6>, lane 0 + 2115969025U, // <5,0,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1845527197U, // <5,0,5,u>: Cost 2 vzipl <5,5,5,5>, LHS + 2973777920U, // <5,0,6,0>: Cost 3 vzipr <3,4,5,6>, <0,0,0,0> + 1846296678U, // <5,0,6,1>: Cost 2 vzipl <5,6,7,0>, LHS + 2128609280U, // <5,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 3189751809U, // <5,0,6,3>: Cost 3 ins <5,u,6,3>, lane 1 + 2920038738U, // <5,0,6,4>: Cost 3 vzipl <5,6,7,0>, <0,4,1,5> + 2920038866U, // <5,0,6,5>: Cost 3 vzipl <5,6,7,0>, <0,5,6,7> + 3189776385U, // <5,0,6,6>: Cost 3 ins <5,u,6,6>, lane 1 + 2128650240U, // <5,0,6,7>: Cost 2 ins <u,0,6,7>, lane 0 + 1846297245U, // <5,0,6,u>: Cost 2 vzipl <5,6,7,0>, LHS + 2040971264U, // <5,0,7,0>: Cost 2 vtrnr RHS, <0,0,0,0> + 2040971274U, // <5,0,7,1>: Cost 2 vtrnr RHS, <0,0,1,1> + 
2040971284U, // <5,0,7,2>: Cost 2 vtrnr RHS, <0,0,2,2> + 2116083713U, // <5,0,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 2116091905U, // <5,0,7,4>: Cost 2 ins <5,u,7,4>, lane 1 + 3114715316U, // <5,0,7,5>: Cost 3 vtrnr RHS, <3,0,4,5> + 2116108289U, // <5,0,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2116116481U, // <5,0,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 2040971281U, // <5,0,7,u>: Cost 2 vtrnr RHS, <0,0,1,u> + 2040979456U, // <5,0,u,0>: Cost 2 vtrnr RHS, <0,0,0,0> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2115641345U, // <5,0,u,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2116091905U, // <5,0,u,4>: Cost 2 ins <5,u,7,4>, lane 1 + 2115952641U, // <5,0,u,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2116108289U, // <5,0,u,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2115969025U, // <5,0,u,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 1712324710U, // <5,1,0,2>: Cost 2 vuzpl <5,7,1,3>, LHS + 2111512578U, // <5,1,0,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2977710418U, // <5,1,0,5>: Cost 3 vzipr <4,1,5,0>, <0,4,1,5> + 3185278978U, // <5,1,0,6>: Cost 3 ins <5,1,u,6>, lane 2 + 3184705539U, // <5,1,0,7>: Cost 3 ins <5,1,0,u>, lane 3 + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2128896000U, // <5,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0 + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2115641345U, // <5,1,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3189407745U, // <5,1,1,6>: Cost 3 ins <5,u,1,6>, lane 1 + 2982367283U, // <5,1,1,7>: Cost 3 vzipr <4,u,5,1>, <5,6,1,7> + 2115641345U, // <5,1,1,u>: Cost 2 ins <5,u,1,3>, lane 1 + 
2128961536U, // <5,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0 + 2128969728U, // <5,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 2128977920U, // <5,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0 + 1055244288U, // <5,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <5,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0 + 2129002496U, // <5,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2129010688U, // <5,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2129018880U, // <5,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0 + 1055244288U, // <5,1,2,u>: Cost 1 ins LHS, lane 0 + 2571468902U, // <5,1,3,0>: Cost 3 vext1 <3,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 2571470542U, // <5,1,3,2>: Cost 3 vext1 <3,5,1,3>, <2,3,4,5> + 2129059840U, // <5,1,3,3>: Cost 2 ins <u,1,3,3>, lane 0 + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 2595361654U, // <5,1,3,6>: Cost 3 vext1 <7,5,1,3>, <6,7,4,5> + 2840331264U, // <5,1,3,7>: Cost 3 vuzpr <3,5,7,1>, <1,3,5,7> + 2129059840U, // <5,1,3,u>: Cost 2 ins <u,1,3,3>, lane 0 + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2111512578U, // <5,1,4,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 1712327990U, // <5,1,4,6>: Cost 2 vuzpl <5,7,1,3>, RHS + 3185000451U, // <5,1,4,7>: Cost 3 ins <5,1,4,u>, lane 3 + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 1712328832U, // <5,1,5,1>: Cost 2 vuzpl <5,7,1,3>, <5,7,1,3> + 2982398102U, // <5,1,5,2>: Cost 3 vzipr <4,u,5,5>, <3,0,1,2> + 2046853222U, // <5,1,5,3>: Cost 2 vtrnr <5,5,5,5>, LHS + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2115952641U, // <5,1,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2646823010U, // <5,1,5,6>: 
Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2115969025U, // <5,1,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 2046853227U, // <5,1,5,u>: Cost 2 vtrnr <5,5,5,5>, LHS + 2920039158U, // <5,1,6,0>: Cost 3 vzipl <5,6,7,0>, <1,0,3,2> + 2961834642U, // <5,1,6,1>: Cost 3 vzipr <1,4,5,6>, <0,u,1,1> + 2973780118U, // <5,1,6,2>: Cost 3 vzipr <3,4,5,6>, <3,0,1,2> + 2111512578U, // <5,1,6,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2224227480U, // <5,1,6,4>: Cost 3 vrev <1,5,4,6> + 2973778258U, // <5,1,6,5>: Cost 3 vzipr <3,4,5,6>, <0,4,1,5> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2111553541U, // <5,1,6,7>: Cost 2 ins <5,1,u,u>, lane 5 + 2111512578U, // <5,1,6,u>: Cost 2 ins <5,1,u,3>, lane 2 + 2116059137U, // <5,1,7,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2040972084U, // <5,1,7,1>: Cost 2 vtrnr RHS, <1,1,1,1> + 2111479811U, // <5,1,7,2>: Cost 2 ins <5,1,7,u>, lane 3 + 967229542U, // <5,1,7,3>: Cost 1 vtrnr RHS, LHS + 2116091905U, // <5,1,7,4>: Cost 2 ins <5,u,7,4>, lane 1 + 2111479811U, // <5,1,7,5>: Cost 2 ins <5,1,7,u>, lane 3 + 2116108289U, // <5,1,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2116116481U, // <5,1,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 967229547U, // <5,1,7,u>: Cost 1 vtrnr RHS, LHS + 2116059137U, // <5,1,u,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2040980276U, // <5,1,u,1>: Cost 2 vtrnr RHS, <1,1,1,1> + 1712330542U, // <5,1,u,2>: Cost 2 vuzpl <5,7,1,3>, LHS + 967237734U, // <5,1,u,3>: Cost 1 vtrnr RHS, LHS + 2116091905U, // <5,1,u,4>: Cost 2 ins <5,u,7,4>, lane 1 + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 1712330906U, // <5,1,u,6>: Cost 2 vuzpl <5,7,1,3>, RHS + 2115969025U, // <5,1,u,7>: Cost 2 ins <5,u,5,7>, lane 1 + 967237739U, // <5,1,u,u>: Cost 1 vtrnr RHS, LHS + 2786132132U, // <5,2,0,0>: Cost 3 vuzpl <5,7,2,2>, <0,2,0,2> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2129494016U, // <5,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0 + 2973728870U, // <5,2,0,3>: Cost 3 vzipr <3,4,5,0>, LHS + 2786164940U, // <5,2,0,4>: Cost 3 vuzpl <5,7,2,6>, <0,2,4,6> + 
2782158977U, // <5,2,0,5>: Cost 3 vuzpl <5,1,2,3>, <0,1,5,3> + 3185942530U, // <5,2,0,6>: Cost 3 ins <5,2,u,6>, lane 2 + 3114658883U, // <5,2,0,7>: Cost 3 vtrnr <4,5,6,0>, <4,2,6,7> + 2129494016U, // <5,2,0,u>: Cost 2 ins <u,2,0,2>, lane 0 + 3054503590U, // <5,2,1,0>: Cost 3 vtrnl <5,7,1,3>, <2,3,0,1> + 3203301376U, // <5,2,1,1>: Cost 3 ins <u,2,1,1>, lane 0 + 2982363156U, // <5,2,1,2>: Cost 3 vzipr <4,u,5,1>, <0,0,2,2> + 1908621414U, // <5,2,1,3>: Cost 2 vzipr <4,u,5,1>, LHS + 3054503630U, // <5,2,1,4>: Cost 3 vtrnl <5,7,1,3>, <2,3,4,5> + 2601390208U, // <5,2,1,5>: Cost 3 vext1 <u,5,2,1>, <5,7,1,3> + 2982363484U, // <5,2,1,6>: Cost 3 vzipr <4,u,5,1>, <0,4,2,6> + 3189415937U, // <5,2,1,7>: Cost 3 ins <5,u,1,7>, lane 1 + 1908621419U, // <5,2,1,u>: Cost 2 vzipr <4,u,5,1>, LHS + 3203366912U, // <5,2,2,0>: Cost 3 ins <u,2,2,0>, lane 0 + 3203375104U, // <5,2,2,1>: Cost 3 ins <u,2,2,1>, lane 0 + 2129641472U, // <5,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0 + 2129649664U, // <5,2,2,3>: Cost 2 ins <u,2,2,3>, lane 0 + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 2698036870U, // <5,2,2,5>: Cost 3 vext3 <2,2,5,5>, <2,2,5,5> + 3189481473U, // <5,2,2,6>: Cost 3 ins <5,u,2,6>, lane 1 + 2846239811U, // <5,2,2,7>: Cost 3 vuzpr <4,5,6,2>, <4,2,6,7> + 2129641472U, // <5,2,2,u>: Cost 2 ins <u,2,2,2>, lane 0 + 2129698816U, // <5,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0 + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 2129723392U, // <5,2,3,3>: Cost 2 ins <u,2,3,3>, lane 0 + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2717943511U, // <5,2,3,5>: Cost 3 vext3 <5,5,5,5>, <2,3,5,5> + 3203489792U, // <5,2,3,6>: Cost 3 ins <u,2,3,6>, lane 0 + 2827879424U, // <5,2,3,7>: Cost 3 vuzpr <1,5,0,2>, <1,3,5,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 3203514368U, // <5,2,4,0>: Cost 3 ins <u,2,4,0>, lane 0 + 3189587969U, // <5,2,4,1>: Cost 3 ins <5,u,4,1>, lane 1 + 
2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 3203547136U, // <5,2,4,4>: Cost 3 ins <u,2,4,4>, lane 0 + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2129821696U, // <5,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0 + 2846239973U, // <5,2,4,7>: Cost 3 vuzpr <4,5,6,2>, <4,4,6,7> + 2129821696U, // <5,2,4,u>: Cost 2 ins <u,2,4,6>, lane 0 + 3053487782U, // <5,2,5,0>: Cost 3 vtrnl <5,5,5,5>, <2,3,0,1> + 3203596288U, // <5,2,5,1>: Cost 3 ins <u,2,5,1>, lane 0 + 1772498225U, // <5,2,5,2>: Cost 2 vuzpr <4,5,6,2>, <4,5,6,2> + 1908654182U, // <5,2,5,3>: Cost 2 vzipr <4,u,5,5>, LHS + 3053487822U, // <5,2,5,4>: Cost 3 vtrnl <5,5,5,5>, <2,3,4,5> + 2115952641U, // <5,2,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2982396252U, // <5,2,5,6>: Cost 3 vzipr <4,u,5,5>, <0,4,2,6> + 2115969025U, // <5,2,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1908654187U, // <5,2,5,u>: Cost 2 vzipr <4,u,5,5>, LHS + 3203661824U, // <5,2,6,0>: Cost 3 ins <u,2,6,0>, lane 0 + 3189735425U, // <5,2,6,1>: Cost 3 ins <5,u,6,1>, lane 1 + 2973777940U, // <5,2,6,2>: Cost 3 vzipr <3,4,5,6>, <0,0,2,2> + 1900036198U, // <5,2,6,3>: Cost 2 vzipr <3,4,5,6>, LHS + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 2973778186U, // <5,2,6,5>: Cost 3 vzipr <3,4,5,6>, <0,3,2,5> + 2973778268U, // <5,2,6,6>: Cost 3 vzipr <3,4,5,6>, <0,4,2,6> + 2129977344U, // <5,2,6,7>: Cost 2 ins <u,2,6,7>, lane 0 + 1900036203U, // <5,2,6,u>: Cost 2 vzipr <3,4,5,6>, LHS + 2040972182U, // <5,2,7,0>: Cost 2 vtrnr RHS, <1,2,3,0> + 3114713251U, // <5,2,7,1>: Cost 3 vtrnr RHS, <0,2,0,1> + 2040971428U, // <5,2,7,2>: Cost 2 vtrnr RHS, <0,2,0,2> + 1887436902U, // <5,2,7,3>: Cost 2 vzipr <1,3,5,7>, LHS + 2040972186U, // <5,2,7,4>: Cost 2 vtrnr RHS, <1,2,3,4> + 2961178728U, // <5,2,7,5>: Cost 3 vzipr <1,3,5,7>, <0,1,2,5> + 2040971468U, // <5,2,7,6>: Cost 2 vtrnr RHS, <0,2,4,6> + 2116116481U, // <5,2,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 1887436907U, // <5,2,7,u>: Cost 2 
vzipr <1,3,5,7>, LHS + 2040980374U, // <5,2,u,0>: Cost 2 vtrnr RHS, <1,2,3,0> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2040979620U, // <5,2,u,2>: Cost 2 vtrnr RHS, <0,2,0,2> + 1887445094U, // <5,2,u,3>: Cost 2 vzipr <1,3,5,u>, LHS + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2115952641U, // <5,2,u,5>: Cost 2 ins <5,u,5,5>, lane 1 + 2040979660U, // <5,2,u,6>: Cost 2 vtrnr RHS, <0,2,4,6> + 2115969025U, // <5,2,u,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1887445099U, // <5,2,u,u>: Cost 2 vzipr <1,3,5,u>, LHS + 3203883008U, // <5,3,0,0>: Cost 3 ins <u,3,0,0>, lane 0 + 2130149376U, // <5,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0 + 2782904422U, // <5,3,0,2>: Cost 3 vuzpl <5,2,3,4>, LHS + 3186581506U, // <5,3,0,3>: Cost 3 ins <5,3,u,3>, lane 2 + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3053750786U, // <5,3,0,5>: Cost 3 vtrnl <5,6,0,1>, <3,4,5,6> + 2618302971U, // <5,3,0,6>: Cost 3 vext2 <0,1,5,3>, <0,6,2,3> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2130149376U, // <5,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0 + 2982364054U, // <5,3,1,0>: Cost 3 vzipr <4,u,5,1>, <1,2,3,0> + 3054504086U, // <5,3,1,1>: Cost 3 vtrnl <5,7,1,3>, <3,0,1,2> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2130239488U, // <5,3,1,3>: Cost 2 ins <u,3,1,3>, lane 0 + 2982364058U, // <5,3,1,4>: Cost 3 vzipr <4,u,5,1>, <1,2,3,4> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3189407745U, // <5,3,1,6>: Cost 3 ins <5,u,1,6>, lane 1 + 2964448400U, // <5,3,1,7>: Cost 3 vzipr <1,u,5,1>, <1,5,3,7> + 2130239488U, // <5,3,1,u>: Cost 2 ins <u,3,1,3>, lane 0 + 2235845154U, // <5,3,2,0>: Cost 3 vrev <3,5,0,2> + 3204038656U, // <5,3,2,1>: Cost 3 ins <u,3,2,1>, lane 0 + 3204046848U, // <5,3,2,2>: Cost 3 ins <u,3,2,2>, lane 0 + 2130313216U, // <5,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0 + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3204079616U, // 
<5,3,2,6>: Cost 3 ins <u,3,2,6>, lane 0 + 3096314880U, // <5,3,2,7>: Cost 3 vtrnr <1,5,0,2>, <1,3,5,7> + 2130313216U, // <5,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0 + 3204104192U, // <5,3,3,0>: Cost 3 ins <u,3,3,0>, lane 0 + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3204120576U, // <5,3,3,2>: Cost 3 ins <u,3,3,2>, lane 0 + 2130386944U, // <5,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0 + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3189555201U, // <5,3,3,6>: Cost 3 ins <5,u,3,6>, lane 1 + 2971763856U, // <5,3,3,7>: Cost 3 vzipr <3,1,5,3>, <1,5,3,7> + 2130386944U, // <5,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0 + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 2642193381U, // <5,3,4,1>: Cost 3 vext2 <4,1,5,3>, <4,1,5,3> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2130477056U, // <5,3,4,5>: Cost 2 ins <u,3,4,5>, lane 0 + 2846247426U, // <5,3,4,6>: Cost 3 vuzpr <4,5,6,3>, <3,4,5,6> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2130477056U, // <5,3,4,u>: Cost 2 ins <u,3,4,5>, lane 0 + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 3053488278U, // <5,3,5,1>: Cost 3 vtrnl <5,5,5,5>, <3,0,1,2> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 1748320682U, // <5,3,5,3>: Cost 2 vuzpr <0,5,2,3>, <0,5,2,3> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2115952641U, // <5,3,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 3204300800U, // <5,3,5,6>: Cost 3 ins <u,3,5,6>, lane 0 + 2130567168U, // <5,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0 + 2130567168U, // <5,3,5,u>: Cost 2 ins <u,3,5,7>, lane 0 + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3204333568U, // <5,3,6,1>: Cost 3 ins <u,3,6,1>, lane 0 + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 
vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 2973778114U, // <5,3,6,5>: Cost 3 vzipr <3,4,5,6>, <0,2,3,5> + 2973779816U, // <5,3,6,6>: Cost 3 vzipr <3,4,5,6>, <2,5,3,6> + 2130640896U, // <5,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0 + 2130640896U, // <5,3,6,u>: Cost 2 ins <u,3,6,7>, lane 0 + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2961179382U, // <5,3,7,2>: Cost 3 vzipr <1,3,5,7>, <1,0,3,2> + 2040972248U, // <5,3,7,3>: Cost 2 vtrnr RHS, <1,3,1,3> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2040973006U, // <5,3,7,5>: Cost 2 vtrnr RHS, <2,3,4,5> + 2116108289U, // <5,3,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2040972288U, // <5,3,7,7>: Cost 2 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2961187574U, // <5,3,u,2>: Cost 3 vzipr <1,3,5,u>, <1,0,3,2> + 2040980440U, // <5,3,u,3>: Cost 2 vtrnr RHS, <1,3,1,3> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2040981198U, // <5,3,u,5>: Cost 2 vtrnr RHS, <2,3,4,5> + 2116108289U, // <5,3,u,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2040980480U, // <5,3,u,7>: Cost 2 vtrnr RHS, <1,3,5,7> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3189284865U, // <5,4,0,0>: Cost 3 ins <5,u,0,0>, lane 1 + 2113544197U, // <5,4,0,1>: Cost 2 ins <5,4,u,u>, lane 5 + 2781626470U, // <5,4,0,2>: Cost 3 vuzpl <5,0,4,1>, LHS + 2242022676U, // <5,4,0,3>: Cost 3 vrev <4,5,3,0> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2113527810U, // <5,4,0,6>: Cost 2 ins <5,4,u,6>, lane 2 + 3114659045U, // <5,4,0,7>: Cost 3 vtrnr <4,5,6,0>, <4,4,6,7> + 2113544197U, // <5,4,0,u>: Cost 2 ins <5,4,u,u>, lane 5 + 1168067834U, // <5,4,1,0>: Cost 2 vrev <4,5,0,1> + 3189366785U, // <5,4,1,1>: Cost 3 ins 
<5,u,1,1>, lane 1 + 3204636672U, // <5,4,1,2>: Cost 3 ins <u,4,1,2>, lane 0 + 2115641345U, // <5,4,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2982366416U, // <5,4,1,4>: Cost 3 vzipr <4,u,5,1>, <4,4,4,4> + 1843006774U, // <5,4,1,5>: Cost 2 vzipl <5,1,7,3>, RHS + 1980763446U, // <5,4,1,6>: Cost 2 vtrnl <5,7,1,3>, RHS + 3189415937U, // <5,4,1,7>: Cost 3 ins <5,u,1,7>, lane 1 + 1843007017U, // <5,4,1,u>: Cost 2 vzipl <5,1,7,3>, RHS + 3204694016U, // <5,4,2,0>: Cost 3 ins <u,4,2,0>, lane 0 + 2241891588U, // <5,4,2,1>: Cost 3 vrev <4,5,1,2> + 3189448705U, // <5,4,2,2>: Cost 3 ins <5,u,2,2>, lane 1 + 2113544197U, // <5,4,2,3>: Cost 2 ins <5,4,u,u>, lane 5 + 3204726784U, // <5,4,2,4>: Cost 3 ins <u,4,2,4>, lane 0 + 2973746894U, // <5,4,2,5>: Cost 3 vzipr <3,4,5,2>, <2,3,4,5> + 2131001344U, // <5,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0 + 3114675429U, // <5,4,2,7>: Cost 3 vtrnr <4,5,6,2>, <4,4,6,7> + 2113544197U, // <5,4,2,u>: Cost 2 ins <5,4,u,u>, lane 5 + 3204767744U, // <5,4,3,0>: Cost 3 ins <u,4,3,0>, lane 0 + 2241899781U, // <5,4,3,1>: Cost 3 vrev <4,5,1,3> + 1168231694U, // <5,4,3,2>: Cost 2 vrev <4,5,2,3> + 3189530625U, // <5,4,3,3>: Cost 3 ins <5,u,3,3>, lane 1 + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 2978399950U, // <5,4,3,5>: Cost 3 vzipr <4,2,5,3>, <2,3,4,5> + 2113527810U, // <5,4,3,6>: Cost 2 ins <5,4,u,6>, lane 2 + 2840355840U, // <5,4,3,7>: Cost 3 vuzpr <3,5,7,4>, <1,3,5,7> + 2113527810U, // <5,4,3,u>: Cost 2 ins <5,4,u,6>, lane 2 + 2918763410U, // <5,4,4,0>: Cost 3 vzipl <5,4,7,6>, <4,0,5,1> + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3186991107U, // <5,4,4,2>: Cost 3 ins <5,4,4,u>, lane 3 + 3186991107U, // <5,4,4,3>: Cost 3 ins <5,4,4,u>, lane 3 + 2131132416U, // <5,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0 + 1845022006U, // <5,4,4,5>: Cost 2 vzipl <5,4,7,6>, RHS + 2113527810U, // <5,4,4,6>: Cost 2 ins <5,4,u,6>, lane 2 + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1845022249U, // <5,4,4,u>: Cost 2 vzipl 
<5,4,7,6>, RHS + 1503936614U, // <5,4,5,0>: Cost 2 vext1 <4,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3189678081U, // <5,4,5,3>: Cost 3 ins <5,u,5,3>, lane 1 + 1168395554U, // <5,4,5,4>: Cost 2 vrev <4,5,4,5> + 1845529910U, // <5,4,5,5>: Cost 2 vzipl <5,5,5,5>, RHS + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 2115969025U, // <5,4,5,7>: Cost 2 ins <5,u,5,7>, lane 1 + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771800U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,5,4,6> + 3189743617U, // <5,4,6,2>: Cost 3 ins <5,u,6,2>, lane 1 + 2571717194U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,5,4,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 1846299958U, // <5,4,6,5>: Cost 2 vzipl <5,6,7,0>, RHS + 2131296256U, // <5,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0 + 2113544197U, // <5,4,6,7>: Cost 2 ins <5,4,u,u>, lane 5 + 1846300201U, // <5,4,6,u>: Cost 2 vzipl <5,6,7,0>, RHS + 2116059137U, // <5,4,7,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2113470467U, // <5,4,7,1>: Cost 2 ins <5,4,7,u>, lane 3 + 2113470467U, // <5,4,7,2>: Cost 2 ins <5,4,7,u>, lane 3 + 2116083713U, // <5,4,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 2040974544U, // <5,4,7,4>: Cost 2 vtrnr RHS, <4,4,4,4> + 2040971602U, // <5,4,7,5>: Cost 2 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2116116481U, // <5,4,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2116059137U, // <5,4,u,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2113544197U, // <5,4,u,1>: Cost 2 ins <5,4,u,u>, lane 5 + 2113470467U, // <5,4,u,2>: Cost 2 ins <5,4,7,u>, lane 3 + 2115641345U, // <5,4,u,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2040982736U, // <5,4,u,4>: Cost 2 vtrnr RHS, <4,4,4,4> + 2040979794U, // <5,4,u,5>: Cost 2 vtrnr RHS, <0,4,1,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2115969025U, // <5,4,u,7>: Cost 2 ins <5,u,5,7>, lane 
1 + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2040917295U, // <5,5,0,0>: Cost 2 vtrnr <4,5,6,0>, <4,5,6,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 1711308902U, // <5,5,0,2>: Cost 2 vuzpl <5,5,5,5>, LHS + 3187908610U, // <5,5,0,3>: Cost 3 ins <5,5,u,3>, lane 2 + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2114183170U, // <5,5,0,5>: Cost 2 ins <5,5,u,5>, lane 2 + 3187933186U, // <5,5,0,6>: Cost 3 ins <5,5,u,6>, lane 2 + 2114199554U, // <5,5,0,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 1908624922U, // <5,5,1,1>: Cost 2 vzipr <4,u,5,1>, <4,u,5,1> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 1778417766U, // <5,5,1,3>: Cost 2 vuzpr <5,5,5,5>, LHS + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2114183170U, // <5,5,1,5>: Cost 2 ins <5,5,u,5>, lane 2 + 2982365698U, // <5,5,1,6>: Cost 3 vzipr <4,u,5,1>, <3,4,5,6> + 2114199554U, // <5,5,1,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1778417771U, // <5,5,1,u>: Cost 2 vuzpr <5,5,5,5>, LHS + 2785052326U, // <5,5,2,0>: Cost 3 vuzpl <5,5,5,5>, <2,3,0,1> + 3205365760U, // <5,5,2,1>: Cost 3 ins <u,5,2,1>, lane 0 + 2040933681U, // <5,5,2,2>: Cost 2 vtrnr <4,5,6,2>, <4,5,6,2> + 2114207749U, // <5,5,2,3>: Cost 2 ins <5,5,u,u>, lane 5 + 2785052366U, // <5,5,2,4>: Cost 3 vuzpl <5,5,5,5>, <2,3,4,5> + 2114183170U, // <5,5,2,5>: Cost 2 ins <5,5,u,5>, lane 2 + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 2114199554U, // <5,5,2,7>: Cost 2 ins <5,5,u,7>, lane 2 + 2114207749U, // <5,5,2,u>: Cost 2 ins <5,5,u,u>, lane 5 + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 2785052822U, // <5,5,3,1>: Cost 3 vuzpl <5,5,5,5>, <3,0,1,2> + 3187900418U, // <5,5,3,2>: Cost 3 ins <5,5,u,2>, lane 2 + 1880105089U, // <5,5,3,3>: Cost 2 vzipr <0,1,5,3>, <0,1,5,3> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2114183170U, // <5,5,3,5>: 
Cost 2 ins <5,5,u,5>, lane 2 + 3205480448U, // <5,5,3,6>: Cost 3 ins <u,5,3,6>, lane 0 + 2131746816U, // <5,5,3,7>: Cost 2 ins <u,5,3,7>, lane 0 + 2131746816U, // <5,5,3,u>: Cost 2 ins <u,5,3,7>, lane 0 + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2716987279U, // <5,5,4,1>: Cost 3 vext3 <5,4,1,5>, <5,4,1,5> + 3187900418U, // <5,5,4,2>: Cost 3 ins <5,5,u,2>, lane 2 + 3187908610U, // <5,5,4,3>: Cost 3 ins <5,5,u,3>, lane 2 + 1845022662U, // <5,5,4,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 1711312182U, // <5,5,4,6>: Cost 2 vuzpl <5,5,5,5>, RHS + 2114199554U, // <5,5,4,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2113986563U, // <5,5,5,1>: Cost 2 ins <5,5,5,u>, lane 3 + 2113986563U, // <5,5,5,2>: Cost 2 ins <5,5,5,u>, lane 3 + 2113986563U, // <5,5,5,3>: Cost 2 ins <5,5,5,u>, lane 3 + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2113986563U, // <5,5,5,6>: Cost 2 ins <5,5,5,u>, lane 3 + 1778421046U, // <5,5,5,7>: Cost 2 vuzpr <5,5,5,5>, RHS + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2131910656U, // <5,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0 + 2131918848U, // <5,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 2131927040U, // <5,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2131935232U, // <5,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0 + 2131943424U, // <5,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0 + 2131951616U, // <5,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 1900038658U, // <5,5,6,6>: Cost 2 vzipr <3,4,5,6>, <3,4,5,6> + 1058226176U, // <5,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <5,5,6,u>: Cost 1 ins RHS, lane 0 + 2116059137U, // <5,5,7,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2114134019U, // <5,5,7,1>: Cost 2 ins <5,5,7,u>, lane 3 + 2114134019U, // <5,5,7,2>: Cost 2 ins <5,5,7,u>, lane 3 + 2116083713U, // <5,5,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 2116091905U, 
// <5,5,7,4>: Cost 2 ins <5,u,7,4>, lane 1 + 2040975364U, // <5,5,7,5>: Cost 2 vtrnr RHS, <5,5,5,5> + 2116108289U, // <5,5,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 967232822U, // <5,5,7,7>: Cost 1 vtrnr RHS, RHS + 967232823U, // <5,5,7,u>: Cost 1 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 1711314734U, // <5,5,u,2>: Cost 2 vuzpl <5,5,5,5>, LHS + 1778418333U, // <5,5,u,3>: Cost 2 vuzpr <5,5,5,5>, LHS + 1845022662U, // <5,5,u,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6> + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 1711315098U, // <5,5,u,6>: Cost 2 vuzpl <5,5,5,5>, RHS + 967241014U, // <5,5,u,7>: Cost 1 vtrnr RHS, RHS + 967241015U, // <5,5,u,u>: Cost 1 vtrnr RHS, RHS + 2114805762U, // <5,6,0,0>: Cost 2 ins <5,6,u,0>, lane 2 + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2132148224U, // <5,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0 + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2114838530U, // <5,6,0,4>: Cost 2 ins <5,6,u,4>, lane 2 + 3188588546U, // <5,6,0,5>: Cost 3 ins <5,6,u,5>, lane 2 + 3188596738U, // <5,6,0,6>: Cost 3 ins <5,6,u,6>, lane 2 + 2973732150U, // <5,6,0,7>: Cost 3 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2114805762U, // <5,6,1,0>: Cost 2 ins <5,6,u,0>, lane 2 + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2115641345U, // <5,6,1,3>: Cost 2 ins <5,u,1,3>, lane 1 + 2114838530U, // <5,6,1,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 2982366436U, // <5,6,1,6>: Cost 3 vzipr <4,u,5,1>, <4,4,6,6> + 1908624694U, // <5,6,1,7>: Cost 2 vzipr <4,u,5,1>, RHS + 1908624695U, // <5,6,1,u>: Cost 2 vzipr <4,u,5,1>, RHS + 2114805762U, // <5,6,2,0>: Cost 2 ins <5,6,u,0>, lane 2 + 3188555778U, // <5,6,2,1>: Cost 3 ins <5,6,u,1>, lane 2 + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 
2114871301U, // <5,6,2,3>: Cost 2 ins <5,6,u,u>, lane 5 + 2114838530U, // <5,6,2,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2964458806U, // <5,6,2,7>: Cost 3 vzipr <1,u,5,2>, RHS + 2114805762U, // <5,6,2,u>: Cost 2 ins <5,6,u,0>, lane 2 + 2114805762U, // <5,6,3,0>: Cost 2 ins <5,6,u,0>, lane 2 + 3206103040U, // <5,6,3,1>: Cost 3 ins <u,6,3,1>, lane 0 + 3206111232U, // <5,6,3,2>: Cost 3 ins <u,6,3,2>, lane 0 + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2783119874U, // <5,6,3,5>: Cost 3 vuzpl <5,2,6,3>, <3,4,5,6> + 3206144000U, // <5,6,3,6>: Cost 3 ins <u,6,3,6>, lane 0 + 2132410368U, // <5,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0 + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2114805762U, // <5,6,4,0>: Cost 2 ins <5,6,u,0>, lane 2 + 3189587969U, // <5,6,4,1>: Cost 3 ins <5,u,4,1>, lane 1 + 2918765050U, // <5,6,4,2>: Cost 3 vzipl <5,4,7,6>, <6,2,7,3> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2114838530U, // <5,6,4,4>: Cost 2 ins <5,6,u,4>, lane 2 + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2132475904U, // <5,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0 + 2972437814U, // <5,6,4,7>: Cost 3 vzipr <3,2,5,4>, RHS + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2114805762U, // <5,6,5,0>: Cost 2 ins <5,6,u,0>, lane 2 + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 2982398876U, // <5,6,5,2>: Cost 3 vzipr <4,u,5,5>, <4,0,6,2> + 3189678081U, // <5,6,5,3>: Cost 3 ins <5,u,5,3>, lane 1 + 2114838530U, // <5,6,5,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2115952641U, // <5,6,5,5>: Cost 2 ins <5,u,5,5>, lane 1 + 1772530997U, // <5,6,5,6>: Cost 2 vuzpr <4,5,6,6>, <4,5,6,6> + 1908657462U, // <5,6,5,7>: Cost 2 vzipr <4,u,5,5>, RHS + 1908657463U, // <5,6,5,u>: Cost 2 vzipr <4,u,5,5>, RHS + 2114805762U, // <5,6,6,0>: Cost 2 
ins <5,6,u,0>, lane 2 + 3189735425U, // <5,6,6,1>: Cost 3 ins <5,u,6,1>, lane 1 + 2920043002U, // <5,6,6,2>: Cost 3 vzipl <5,6,7,0>, <6,2,7,3> + 2973781298U, // <5,6,6,3>: Cost 3 vzipr <3,4,5,6>, <4,5,6,3> + 2114838530U, // <5,6,6,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2973781138U, // <5,6,6,5>: Cost 3 vzipr <3,4,5,6>, <4,3,6,5> + 2132623360U, // <5,6,6,6>: Cost 2 ins <u,6,6,6>, lane 0 + 1900039478U, // <5,6,6,7>: Cost 2 vzipr <3,4,5,6>, RHS + 1900039479U, // <5,6,6,u>: Cost 2 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1887440182U, // <5,6,7,7>: Cost 2 vzipr <1,3,5,7>, RHS + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1887448374U, // <5,6,u,7>: Cost 2 vzipr <1,3,5,u>, RHS + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 1772535808U, // <5,7,0,0>: Cost 2 vuzpr RHS, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 1772535828U, // <5,7,0,2>: Cost 2 vuzpr RHS, <0,0,2,2> + 2115493890U, // <5,7,0,3>: Cost 2 ins <5,7,u,3>, lane 2 + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2846279860U, // <5,7,0,5>: Cost 3 vuzpr RHS, <3,0,4,5> + 2846277674U, // <5,7,0,6>: Cost 3 vuzpr RHS, <0,0,4,6> + 2115526658U, // <5,7,0,7>: Cost 2 ins <5,7,u,7>, lane 2 + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, 
LHS + 2115018755U, // <5,7,1,0>: Cost 2 ins <5,7,1,u>, lane 3 + 1772536628U, // <5,7,1,1>: Cost 2 vuzpr RHS, <1,1,1,1> + 2115018755U, // <5,7,1,2>: Cost 2 ins <5,7,1,u>, lane 3 + 698794086U, // <5,7,1,3>: Cost 1 vuzpr RHS, LHS + 2115018755U, // <5,7,1,4>: Cost 2 ins <5,7,1,u>, lane 3 + 2115018755U, // <5,7,1,5>: Cost 2 ins <5,7,1,u>, lane 3 + 2115018755U, // <5,7,1,6>: Cost 2 ins <5,7,1,u>, lane 3 + 2115526658U, // <5,7,1,7>: Cost 2 ins <5,7,u,7>, lane 2 + 698794091U, // <5,7,1,u>: Cost 1 vuzpr RHS, LHS + 1772536726U, // <5,7,2,0>: Cost 2 vuzpr RHS, <1,2,3,0> + 2846277795U, // <5,7,2,1>: Cost 3 vuzpr RHS, <0,2,0,1> + 1772535972U, // <5,7,2,2>: Cost 2 vuzpr RHS, <0,2,0,2> + 1772537458U, // <5,7,2,3>: Cost 2 vuzpr RHS, <2,2,3,3> + 1772536730U, // <5,7,2,4>: Cost 2 vuzpr RHS, <1,2,3,4> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 1772536012U, // <5,7,2,6>: Cost 2 vuzpr RHS, <0,2,4,6> + 2115526658U, // <5,7,2,7>: Cost 2 ins <5,7,u,7>, lane 2 + 1772535978U, // <5,7,2,u>: Cost 2 vuzpr RHS, <0,2,0,u> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 1772537510U, // <5,7,3,1>: Cost 2 vuzpr RHS, <2,3,0,1> + 2846278606U, // <5,7,3,2>: Cost 3 vuzpr RHS, <1,3,0,2> + 1772536792U, // <5,7,3,3>: Cost 2 vuzpr RHS, <1,3,1,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 1772537550U, // <5,7,3,5>: Cost 2 vuzpr RHS, <2,3,4,5> + 2846278628U, // <5,7,3,6>: Cost 3 vuzpr RHS, <1,3,2,6> + 1772536832U, // <5,7,3,7>: Cost 2 vuzpr RHS, <1,3,5,7> + 1772536797U, // <5,7,3,u>: Cost 2 vuzpr RHS, <1,3,1,u> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 2846277958U, // <5,7,4,2>: Cost 3 vuzpr RHS, <0,4,0,2> + 2115493890U, // <5,7,4,3>: Cost 2 ins <5,7,u,3>, lane 2 + 1772539088U, // <5,7,4,4>: Cost 2 vuzpr RHS, <4,4,4,4> + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 1772536156U, // <5,7,4,6>: Cost 2 vuzpr RHS, <0,4,2,6> + 2115526658U, // <5,7,4,7>: Cost 2 ins 
<5,7,u,7>, lane 2 + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2115313667U, // <5,7,5,0>: Cost 2 ins <5,7,5,u>, lane 3 + 2115313667U, // <5,7,5,1>: Cost 2 ins <5,7,5,u>, lane 3 + 2115313667U, // <5,7,5,2>: Cost 2 ins <5,7,5,u>, lane 3 + 2115493890U, // <5,7,5,3>: Cost 2 ins <5,7,u,3>, lane 2 + 2115313667U, // <5,7,5,4>: Cost 2 ins <5,7,5,u>, lane 3 + 1772539908U, // <5,7,5,5>: Cost 2 vuzpr RHS, <5,5,5,5> + 2115313667U, // <5,7,5,6>: Cost 2 ins <5,7,5,u>, lane 3 + 698797366U, // <5,7,5,7>: Cost 1 vuzpr RHS, RHS + 698797367U, // <5,7,5,u>: Cost 1 vuzpr RHS, RHS + 1772540002U, // <5,7,6,0>: Cost 2 vuzpr RHS, <5,6,7,0> + 2846279577U, // <5,7,6,1>: Cost 3 vuzpr RHS, <2,6,0,1> + 1772539212U, // <5,7,6,2>: Cost 2 vuzpr RHS, <4,6,0,2> + 2115493890U, // <5,7,6,3>: Cost 2 ins <5,7,u,3>, lane 2 + 1772540006U, // <5,7,6,4>: Cost 2 vuzpr RHS, <5,6,7,4> + 2846279617U, // <5,7,6,5>: Cost 3 vuzpr RHS, <2,6,4,5> + 1772539252U, // <5,7,6,6>: Cost 2 vuzpr RHS, <4,6,4,6> + 1772537786U, // <5,7,6,7>: Cost 2 vuzpr RHS, <2,6,3,7> + 1772537787U, // <5,7,6,u>: Cost 2 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 1772540750U, // <5,7,7,1>: Cost 2 vuzpr RHS, <6,7,0,1> + 2846281846U, // <5,7,7,2>: Cost 3 vuzpr RHS, <5,7,0,2> + 1772540032U, // <5,7,7,3>: Cost 2 vuzpr RHS, <5,7,1,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1772540790U, // <5,7,7,5>: Cost 2 vuzpr RHS, <6,7,4,5> + 2116108289U, // <5,7,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 1772540072U, // <5,7,7,7>: Cost 2 vuzpr RHS, <5,7,5,7> + 1772540037U, // <5,7,7,u>: Cost 2 vuzpr RHS, <5,7,1,u> + 1772537212U, // <5,7,u,0>: Cost 2 vuzpr RHS, <1,u,3,0> + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 1772536458U, // <5,7,u,2>: Cost 2 vuzpr RHS, <0,u,0,2> + 698794653U, // <5,7,u,3>: Cost 1 vuzpr RHS, LHS + 1772537216U, // <5,7,u,4>: Cost 2 vuzpr RHS, <1,u,3,4> + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 1772536480U, // <5,7,u,6>: Cost 2 vuzpr RHS, <0,u,2,6> 
+ 698797609U, // <5,7,u,7>: Cost 1 vuzpr RHS, RHS + 698794658U, // <5,7,u,u>: Cost 1 vuzpr RHS, LHS + 1772544000U, // <5,u,0,0>: Cost 2 vuzpr RHS, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1772544020U, // <5,u,0,2>: Cost 2 vuzpr RHS, <0,0,2,2> + 2111512578U, // <5,u,0,3>: Cost 2 ins <5,1,u,3>, lane 2 + 2114838530U, // <5,u,0,4>: Cost 2 ins <5,6,u,4>, lane 2 + 2114183170U, // <5,u,0,5>: Cost 2 ins <5,5,u,5>, lane 2 + 2113527810U, // <5,u,0,6>: Cost 2 ins <5,4,u,6>, lane 2 + 2114199554U, // <5,u,0,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2114805762U, // <5,u,1,0>: Cost 2 ins <5,6,u,0>, lane 2 + 1772544820U, // <5,u,1,1>: Cost 2 vuzpr RHS, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 698802278U, // <5,u,1,3>: Cost 1 vuzpr RHS, LHS + 2114838530U, // <5,u,1,4>: Cost 2 ins <5,6,u,4>, lane 2 + 1843009690U, // <5,u,1,5>: Cost 2 vzipl <5,1,7,3>, RHS + 1980766362U, // <5,u,1,6>: Cost 2 vtrnl <5,7,1,3>, RHS + 1908624712U, // <5,u,1,7>: Cost 2 vzipr <4,u,5,1>, RHS + 698802283U, // <5,u,1,u>: Cost 1 vuzpr RHS, LHS + 1772544918U, // <5,u,2,0>: Cost 2 vuzpr RHS, <1,2,3,0> + 2128969728U, // <5,u,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 1772544164U, // <5,u,2,2>: Cost 2 vuzpr RHS, <0,2,0,2> + 1055244288U, // <5,u,2,3>: Cost 1 ins LHS, lane 0 + 1772544922U, // <5,u,2,4>: Cost 2 vuzpr RHS, <1,2,3,4> + 2129002496U, // <5,u,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 1772544204U, // <5,u,2,6>: Cost 2 vuzpr RHS, <0,2,4,6> + 2114199554U, // <5,u,2,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1055244288U, // <5,u,2,u>: Cost 1 ins LHS, lane 0 + 2129698816U, // <5,u,3,0>: Cost 2 ins <u,2,3,0>, lane 0 + 1772545702U, // <5,u,3,1>: Cost 2 vuzpr RHS, <2,3,0,1> + 2128388096U, // <5,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 1772544984U, // <5,u,3,3>: Cost 2 vuzpr RHS, <1,3,1,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 1772545742U, // <5,u,3,5>: Cost 2 vuzpr RHS, <2,3,4,5> + 2113527810U, // 
<5,u,3,6>: Cost 2 ins <5,4,u,6>, lane 2 + 1772545024U, // <5,u,3,7>: Cost 2 vuzpr RHS, <1,3,5,7> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 2114805762U, // <5,u,4,0>: Cost 2 ins <5,6,u,0>, lane 2 + 1845024558U, // <5,u,4,1>: Cost 2 vzipl <5,4,7,6>, LHS + 2642897979U, // <5,u,4,2>: Cost 3 vext2 <4,2,5,u>, <4,2,5,u> + 2111512578U, // <5,u,4,3>: Cost 2 ins <5,1,u,3>, lane 2 + 1772547280U, // <5,u,4,4>: Cost 2 vuzpr RHS, <4,4,4,4> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 1772544348U, // <5,u,4,6>: Cost 2 vuzpr RHS, <0,4,2,6> + 2114199554U, // <5,u,4,7>: Cost 2 ins <5,5,u,7>, lane 2 + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1845532462U, // <5,u,5,1>: Cost 2 vzipl <5,5,5,5>, LHS + 1979750190U, // <5,u,5,2>: Cost 2 vtrnl <5,5,5,5>, LHS + 1908654236U, // <5,u,5,3>: Cost 2 vzipr <4,u,5,5>, LHS + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 698805558U, // <5,u,5,7>: Cost 1 vuzpr RHS, RHS + 698805559U, // <5,u,5,u>: Cost 1 vuzpr RHS, RHS + 1772548194U, // <5,u,6,0>: Cost 2 vuzpr RHS, <5,6,7,0> + 1846302510U, // <5,u,6,1>: Cost 2 vzipl <5,6,7,0>, LHS + 1772547404U, // <5,u,6,2>: Cost 2 vuzpr RHS, <4,6,0,2> + 1900036252U, // <5,u,6,3>: Cost 2 vzipr <3,4,5,6>, LHS + 1772548198U, // <5,u,6,4>: Cost 2 vuzpr RHS, <5,6,7,4> + 1846302874U, // <5,u,6,5>: Cost 2 vzipl <5,6,7,0>, RHS + 1772547444U, // <5,u,6,6>: Cost 2 vuzpr RHS, <4,6,4,6> + 1058226176U, // <5,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <5,u,6,u>: Cost 1 ins RHS, lane 0 + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 2040971914U, // <5,u,7,2>: Cost 2 vtrnr RHS, <0,u,0,2> + 967230109U, // <5,u,7,3>: Cost 1 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 2040971926U, // <5,u,7,5>: Cost 2 vtrnr RHS, <0,u,1,5> 
+ 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 967233065U, // <5,u,7,7>: Cost 1 vtrnr RHS, RHS + 967230114U, // <5,u,7,u>: Cost 1 vtrnr RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 698802845U, // <5,u,u,3>: Cost 1 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 698805801U, // <5,u,u,7>: Cost 1 vuzpr RHS, RHS + 698802850U, // <5,u,u,u>: Cost 1 vuzpr RHS, LHS + 2128150528U, // <6,0,0,0>: Cost 2 ins <u,0,0,0>, lane 0 + 2121523201U, // <6,0,0,1>: Cost 2 ins <6,u,0,1>, lane 1 + 1718206566U, // <6,0,0,2>: Cost 2 vuzpl <6,7,0,1>, LHS + 2852933922U, // <6,0,0,3>: Cost 3 vuzpr <5,6,7,0>, <6,0,1,3> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 2852934680U, // <6,0,0,5>: Cost 3 vuzpr <5,6,7,0>, <7,0,4,5> + 2852934690U, // <6,0,0,6>: Cost 3 vuzpr <5,6,7,0>, <7,0,5,6> + 2852933962U, // <6,0,0,7>: Cost 3 vuzpr <5,6,7,0>, <6,0,5,7> + 1718206620U, // <6,0,0,u>: Cost 2 vuzpl <6,7,0,1>, LHS + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 2128232448U, // <6,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0 + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1779187814U, // <6,0,1,3>: Cost 2 vuzpr <5,6,7,0>, LHS + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2791949566U, // <6,0,1,7>: Cost 3 vuzpl <6,7,0,1>, <1,6,7,0> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504280678U, // <6,0,2,0>: Cost 2 vext1 <4,6,0,2>, LHS + 1849639014U, // <6,0,2,1>: Cost 2 vzipl <6,2,7,3>, LHS + 2128314368U, // <6,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0 + 2128322560U, // <6,0,2,3>: Cost 2 ins <u,0,2,3>, lane 0 + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 2578026192U, // <6,0,2,5>: 
Cost 3 vext1 <4,6,0,2>, <5,1,7,3> + 2578026792U, // <6,0,2,6>: Cost 3 vext1 <4,6,0,2>, <6,0,2,0> + 2578027514U, // <6,0,2,7>: Cost 3 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3202113536U, // <6,0,3,0>: Cost 3 ins <u,0,3,0>, lane 0 + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2128388096U, // <6,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 2852930520U, // <6,0,3,3>: Cost 3 vuzpr <5,6,7,0>, <1,3,1,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 2852931278U, // <6,0,3,5>: Cost 3 vuzpr <5,6,7,0>, <2,3,4,5> + 3190587394U, // <6,0,3,6>: Cost 3 ins <6,0,u,6>, lane 2 + 2852930560U, // <6,0,3,7>: Cost 3 vuzpr <5,6,7,0>, <1,3,5,7> + 2128388096U, // <6,0,3,u>: Cost 2 ins <u,0,3,2>, lane 0 + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3195576321U, // <6,0,4,3>: Cost 3 ins <6,u,4,3>, lane 1 + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2121850881U, // <6,0,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 1718209846U, // <6,0,4,6>: Cost 2 vuzpl <6,7,0,1>, RHS + 3195609089U, // <6,0,4,7>: Cost 3 ins <6,u,4,7>, lane 1 + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3202260992U, // <6,0,5,0>: Cost 3 ins <u,0,5,0>, lane 0 + 2128527360U, // <6,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0 + 3056156774U, // <6,0,5,2>: Cost 3 vtrnl <6,0,5,7>, LHS + 3190562818U, // <6,0,5,3>: Cost 3 ins <6,0,u,3>, lane 2 + 3058802892U, // <6,0,5,4>: Cost 3 vtrnl <6,4,5,6>, <0,2,4,6> + 2852933636U, // <6,0,5,5>: Cost 3 vuzpr <5,6,7,0>, <5,5,5,5> + 2852932908U, // <6,0,5,6>: Cost 3 vuzpr <5,6,7,0>, <4,5,5,6> + 1779191094U, // <6,0,5,7>: Cost 2 vuzpr <5,6,7,0>, RHS + 1779191095U, // <6,0,5,u>: Cost 2 vuzpr <5,6,7,0>, RHS + 1779191906U, // <6,0,6,0>: Cost 2 vuzpr <5,6,7,0>, <5,6,7,0> + 1852244070U, // <6,0,6,1>: Cost 2 vzipl <6,6,6,6>, LHS + 1986461798U, // <6,0,6,2>: 
Cost 2 vtrnl <6,6,6,6>, LHS + 3195723777U, // <6,0,6,3>: Cost 3 ins <6,u,6,3>, lane 1 + 2852933734U, // <6,0,6,4>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,4> + 3195740161U, // <6,0,6,5>: Cost 3 ins <6,u,6,5>, lane 1 + 2122006529U, // <6,0,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2128650240U, // <6,0,6,7>: Cost 2 ins <u,0,6,7>, lane 0 + 1852244637U, // <6,0,6,u>: Cost 2 vzipl <6,6,6,6>, LHS + 1906753536U, // <6,0,7,0>: Cost 2 vzipr RHS, <0,0,0,0> + 1906755238U, // <6,0,7,1>: Cost 2 vzipr RHS, <2,3,0,1> + 1906753700U, // <6,0,7,2>: Cost 2 vzipr RHS, <0,2,0,2> + 2122055681U, // <6,0,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 2980496418U, // <6,0,7,5>: Cost 3 vzipr RHS, <1,4,0,5> + 2980495690U, // <6,0,7,6>: Cost 3 vzipr RHS, <0,4,0,6> + 2122088449U, // <6,0,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 1906753706U, // <6,0,7,u>: Cost 2 vzipr RHS, <0,2,0,u> + 1906761728U, // <6,0,u,0>: Cost 2 vzipr RHS, <0,0,0,0> + 1906763430U, // <6,0,u,1>: Cost 2 vzipr RHS, <2,3,0,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1779188381U, // <6,0,u,3>: Cost 2 vuzpr <5,6,7,0>, LHS + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2121850881U, // <6,0,u,5>: Cost 2 ins <6,u,4,5>, lane 1 + 1718212762U, // <6,0,u,6>: Cost 2 vuzpl <6,7,0,1>, RHS + 1779191337U, // <6,0,u,7>: Cost 2 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2121523201U, // <6,1,0,1>: Cost 2 ins <6,u,0,1>, lane 1 + 2846673046U, // <6,1,0,2>: Cost 3 vuzpr <4,6,3,1>, <3,0,1,2> + 2047623270U, // <6,1,0,3>: Cost 2 vtrnr <5,6,7,0>, LHS + 2787385548U, // <6,1,0,4>: Cost 3 vuzpl <6,0,1,2>, <0,2,4,6> + 3060384768U, // <6,1,0,5>: Cost 3 vtrnl <6,7,0,1>, <1,3,5,7> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 3060385022U, // <6,1,0,7>: Cost 3 vtrnl <6,7,0,1>, <1,6,7,0> + 2047623275U, // <6,1,0,u>: Cost 2 vtrnr <5,6,7,0>, LHS + 2578088038U, // <6,1,1,0>: Cost 3 
vext1 <4,6,1,1>, LHS + 2128896000U, // <6,1,1,1>: Cost 2 ins <u,1,1,1>, lane 0 + 2981778426U, // <6,1,1,2>: Cost 3 vzipr <4,7,6,1>, <7,0,1,2> + 2128912384U, // <6,1,1,3>: Cost 2 ins <u,1,1,3>, lane 0 + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3202670592U, // <6,1,1,5>: Cost 3 ins <u,1,1,5>, lane 0 + 2691482470U, // <6,1,1,6>: Cost 3 vext3 <1,1,6,6>, <1,1,6,6> + 2980449545U, // <6,1,1,7>: Cost 3 vzipr <4,5,6,1>, <4,5,1,7> + 2128896000U, // <6,1,1,u>: Cost 2 ins <u,1,1,1>, lane 0 + 2128961536U, // <6,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0 + 2128969728U, // <6,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 2128977920U, // <6,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0 + 1055244288U, // <6,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <6,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0 + 2129002496U, // <6,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2129010688U, // <6,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2129018880U, // <6,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0 + 1055244288U, // <6,1,2,u>: Cost 1 ins LHS, lane 0 + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 2129059840U, // <6,1,3,3>: Cost 2 ins <u,1,3,3>, lane 0 + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 2953923849U, // <6,1,3,7>: Cost 3 vzipr <0,1,6,3>, <4,5,1,7> + 2129059840U, // <6,1,3,u>: Cost 2 ins <u,1,3,3>, lane 0 + 2788724044U, // <6,1,4,0>: Cost 3 vuzpl <6,2,1,3>, <4,6,0,2> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3195568129U, // <6,1,4,2>: Cost 3 ins <6,u,4,2>, lane 1 + 2047656038U, // <6,1,4,3>: Cost 2 vtrnr <5,6,7,4>, LHS + 2791378292U, // <6,1,4,4>: Cost 3 vuzpl <6,6,1,3>, <4,6,4,6> + 2121850881U, // <6,1,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 2834506076U, // <6,1,4,6>: Cost 3 vuzpr <2,6,0,1>, <0,4,2,6> + 
2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2047656043U, // <6,1,4,u>: Cost 2 vtrnr <5,6,7,4>, LHS + 2578120806U, // <6,1,5,0>: Cost 3 vext1 <4,6,1,5>, LHS + 2578121728U, // <6,1,5,1>: Cost 3 vext1 <4,6,1,5>, <1,3,5,7> + 3202940928U, // <6,1,5,2>: Cost 3 ins <u,1,5,2>, lane 0 + 2129207296U, // <6,1,5,3>: Cost 2 ins <u,1,5,3>, lane 0 + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3202965504U, // <6,1,5,5>: Cost 3 ins <u,1,5,5>, lane 0 + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 2834509110U, // <6,1,5,7>: Cost 3 vuzpr <2,6,0,1>, RHS + 2129207296U, // <6,1,5,u>: Cost 2 ins <u,1,5,3>, lane 0 + 2925986550U, // <6,1,6,0>: Cost 3 vzipl <6,6,6,6>, <1,0,3,2> + 2834507673U, // <6,1,6,1>: Cost 3 vuzpr <2,6,0,1>, <2,6,0,1> + 2982480022U, // <6,1,6,2>: Cost 3 vzipr <4,u,6,6>, <3,0,1,2> + 2041479270U, // <6,1,6,3>: Cost 2 vtrnr <4,6,4,6>, LHS + 2602020150U, // <6,1,6,4>: Cost 3 vext1 <u,6,1,6>, RHS + 2982478162U, // <6,1,6,5>: Cost 3 vzipr <4,u,6,6>, <0,4,1,5> + 2122006529U, // <6,1,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2129313792U, // <6,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0 + 2041479275U, // <6,1,6,u>: Cost 2 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 1906753546U, // <6,1,7,1>: Cost 2 vzipr RHS, <0,0,1,1> + 1906755734U, // <6,1,7,2>: Cost 2 vzipr RHS, <3,0,1,2> + 2029469798U, // <6,1,7,3>: Cost 2 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 1906753874U, // <6,1,7,5>: Cost 2 vzipr RHS, <0,4,1,5> + 2980495537U, // <6,1,7,6>: Cost 3 vzipr RHS, <0,2,1,6> + 2122088449U, // <6,1,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 2029469803U, // <6,1,7,u>: Cost 2 vtrnr <2,6,3,7>, LHS + 2128961536U, // <6,1,u,0>: Cost 2 ins <u,1,2,0>, lane 0 + 1906761738U, // <6,1,u,1>: Cost 2 vzipr RHS, <0,0,1,1> + 1906763926U, // <6,1,u,2>: Cost 2 vzipr RHS, <3,0,1,2> + 1055244288U, // <6,1,u,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <6,1,u,4>: Cost 2 ins <u,1,2,4>, lane 0 + 
1906762066U, // <6,1,u,5>: Cost 2 vzipr RHS, <0,4,1,5> + 2129010688U, // <6,1,u,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2122088449U, // <6,1,u,7>: Cost 2 ins <6,u,7,7>, lane 1 + 1055244288U, // <6,1,u,u>: Cost 1 ins LHS, lane 0 + 2846457856U, // <6,2,0,0>: Cost 3 vuzpr <4,6,0,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2129494016U, // <6,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0 + 2118148098U, // <6,2,0,3>: Cost 2 ins <6,2,u,3>, lane 2 + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3195297793U, // <6,2,0,5>: Cost 3 ins <6,u,0,5>, lane 1 + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3195314177U, // <6,2,0,7>: Cost 3 ins <6,u,0,7>, lane 1 + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2846458676U, // <6,2,1,1>: Cost 3 vuzpr <4,6,0,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 1772716134U, // <6,2,1,3>: Cost 2 vuzpr <4,6,0,2>, LHS + 3191414787U, // <6,2,1,4>: Cost 3 ins <6,2,1,u>, lane 3 + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 3114885324U, // <6,2,1,6>: Cost 3 vtrnr <4,6,0,1>, <0,2,4,6> + 3191922690U, // <6,2,1,7>: Cost 3 ins <6,2,u,7>, lane 2 + 1772716139U, // <6,2,1,u>: Cost 2 vuzpr <4,6,0,2>, LHS + 2846458774U, // <6,2,2,0>: Cost 3 vuzpr <4,6,0,2>, <1,2,3,0> + 3195412481U, // <6,2,2,1>: Cost 3 ins <6,u,2,1>, lane 1 + 2129641472U, // <6,2,2,2>: Cost 2 ins <u,2,2,2>, lane 0 + 1908703334U, // <6,2,2,3>: Cost 2 vzipr <4,u,6,2>, LHS + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3195445249U, // <6,2,2,5>: Cost 3 ins <6,u,2,5>, lane 1 + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 2846462444U, // <6,2,2,7>: Cost 3 vuzpr <4,6,0,2>, <6,2,5,7> + 1908703339U, // <6,2,2,u>: Cost 2 vzipr <4,u,6,2>, LHS + 2129698816U, // <6,2,3,0>: Cost 2 ins <u,2,3,0>, lane 0 + 2230618020U, // <6,2,3,1>: Cost 3 vrev <2,6,1,3> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, 
<2,3,2,6> + 2129723392U, // <6,2,3,3>: Cost 2 ins <u,2,3,3>, lane 0 + 2129731584U, // <6,2,3,4>: Cost 2 ins <u,2,3,4>, lane 0 + 2846459598U, // <6,2,3,5>: Cost 3 vuzpr <4,6,0,2>, <2,3,4,5> + 2966528348U, // <6,2,3,6>: Cost 3 vzipr <2,2,6,3>, <0,4,2,6> + 2846458880U, // <6,2,3,7>: Cost 3 vuzpr <4,6,0,2>, <1,3,5,7> + 2129698816U, // <6,2,3,u>: Cost 2 ins <u,2,3,0>, lane 0 + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3191873538U, // <6,2,4,1>: Cost 3 ins <6,2,u,1>, lane 2 + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2118148098U, // <6,2,4,3>: Cost 2 ins <6,2,u,3>, lane 2 + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2129821696U, // <6,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0 + 3195609089U, // <6,2,4,7>: Cost 3 ins <6,u,4,7>, lane 1 + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3191709699U, // <6,2,5,0>: Cost 3 ins <6,2,5,u>, lane 3 + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3203604480U, // <6,2,5,2>: Cost 3 ins <u,2,5,2>, lane 0 + 2118148098U, // <6,2,5,3>: Cost 2 ins <6,2,u,3>, lane 2 + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2846461956U, // <6,2,5,5>: Cost 3 vuzpr <4,6,0,2>, <5,5,5,5> + 3115213004U, // <6,2,5,6>: Cost 3 vtrnr <4,6,4,5>, <0,2,4,6> + 1772719414U, // <6,2,5,7>: Cost 2 vuzpr <4,6,0,2>, RHS + 1772719415U, // <6,2,5,u>: Cost 2 vuzpr <4,6,0,2>, RHS + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 3195707393U, // <6,2,6,1>: Cost 3 ins <6,u,6,1>, lane 1 + 1772719436U, // <6,2,6,2>: Cost 2 vuzpr <4,6,0,2>, <4,6,0,2> + 1908736102U, // <6,2,6,3>: Cost 2 vzipr <4,u,6,6>, LHS + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 3195740161U, // <6,2,6,5>: Cost 3 ins <6,u,6,5>, lane 1 + 2122006529U, // <6,2,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2118189061U, // <6,2,6,7>: Cost 2 ins <6,2,u,u>, lane 5 + 1908736107U, // <6,2,6,u>: Cost 2 vzipr <4,u,6,6>, LHS + 2118115331U, // 
<6,2,7,0>: Cost 2 ins <6,2,7,u>, lane 3 + 2118115331U, // <6,2,7,1>: Cost 2 ins <6,2,7,u>, lane 3 + 1906753556U, // <6,2,7,2>: Cost 2 vzipr RHS, <0,0,2,2> + 833011814U, // <6,2,7,3>: Cost 1 vzipr RHS, LHS + 2118115331U, // <6,2,7,4>: Cost 2 ins <6,2,7,u>, lane 3 + 2118115331U, // <6,2,7,5>: Cost 2 ins <6,2,7,u>, lane 3 + 1906753884U, // <6,2,7,6>: Cost 2 vzipr RHS, <0,4,2,6> + 2122088449U, // <6,2,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 833011819U, // <6,2,7,u>: Cost 1 vzipr RHS, LHS + 2129698816U, // <6,2,u,0>: Cost 2 ins <u,2,3,0>, lane 0 + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 1906761748U, // <6,2,u,2>: Cost 2 vzipr RHS, <0,0,2,2> + 833020006U, // <6,2,u,3>: Cost 1 vzipr RHS, LHS + 2129731584U, // <6,2,u,4>: Cost 2 ins <u,2,3,4>, lane 0 + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 1906762076U, // <6,2,u,6>: Cost 2 vzipr RHS, <0,4,2,6> + 1772719657U, // <6,2,u,7>: Cost 2 vuzpr <4,6,0,2>, RHS + 833020011U, // <6,2,u,u>: Cost 1 vzipr RHS, LHS + 3203883008U, // <6,3,0,0>: Cost 3 ins <u,3,0,0>, lane 0 + 2130149376U, // <6,3,0,1>: Cost 2 ins <u,3,0,1>, lane 0 + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3121365976U, // <6,3,0,3>: Cost 3 vtrnr <5,6,7,0>, <1,3,1,3> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 3121366734U, // <6,3,0,5>: Cost 3 vtrnr <5,6,7,0>, <2,3,4,5> + 3195305985U, // <6,3,0,6>: Cost 3 ins <6,u,0,6>, lane 1 + 3121366016U, // <6,3,0,7>: Cost 3 vtrnr <5,6,7,0>, <1,3,5,7> + 2130149376U, // <6,3,0,u>: Cost 2 ins <u,3,0,1>, lane 0 + 2578235494U, // <6,3,1,0>: Cost 3 vext1 <4,6,3,1>, LHS + 3203964928U, // <6,3,1,1>: Cost 3 ins <u,3,1,1>, lane 0 + 3203973120U, // <6,3,1,2>: Cost 3 ins <u,3,1,2>, lane 0 + 2130239488U, // <6,3,1,3>: Cost 2 ins <u,3,1,3>, lane 0 + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3203997696U, // <6,3,1,5>: Cost 3 ins <u,3,1,5>, lane 0 + 2822725737U, // <6,3,1,6>: Cost 3 vuzpr <0,6,2,3>, <0,1,2,6> + 2970494906U, // <6,3,1,7>: Cost 3 vzipr 
<2,u,6,1>, <2,6,3,7> + 2130239488U, // <6,3,1,u>: Cost 2 ins <u,3,1,3>, lane 0 + 2982445974U, // <6,3,2,0>: Cost 3 vzipr <4,u,6,2>, <1,2,3,0> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 2630985357U, // <6,3,2,2>: Cost 3 vext2 <2,2,6,3>, <2,2,6,3> + 2130313216U, // <6,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0 + 2982445978U, // <6,3,2,4>: Cost 3 vzipr <4,u,6,2>, <1,2,3,4> + 3114895054U, // <6,3,2,5>: Cost 3 vtrnr <4,6,0,2>, <2,3,4,5> + 2834596044U, // <6,3,2,6>: Cost 3 vuzpr <2,6,1,3>, <0,2,4,6> + 3114894336U, // <6,3,2,7>: Cost 3 vtrnr <4,6,0,2>, <1,3,5,7> + 2130313216U, // <6,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0 + 2578251878U, // <6,3,3,0>: Cost 3 vext1 <4,6,3,3>, LHS + 2792163478U, // <6,3,3,1>: Cost 3 vuzpl <6,7,3,0>, <3,0,1,2> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2130386944U, // <6,3,3,3>: Cost 2 ins <u,3,3,3>, lane 0 + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 2792196610U, // <6,3,3,5>: Cost 3 vuzpl <6,7,3,4>, <3,4,5,6> + 2590200602U, // <6,3,3,6>: Cost 3 vext1 <6,6,3,3>, <6,6,3,3> + 2972501946U, // <6,3,3,7>: Cost 3 vzipr <3,2,6,3>, <2,6,3,7> + 2130386944U, // <6,3,3,u>: Cost 2 ins <u,3,3,3>, lane 0 + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2705050078U, // <6,3,4,1>: Cost 3 vext3 <3,4,1,6>, <3,4,1,6> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2846540124U, // <6,3,4,6>: Cost 3 vuzpr <4,6,1,3>, <0,4,2,6> + 3121398784U, // <6,3,4,7>: Cost 3 vtrnr <5,6,7,4>, <1,3,5,7> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 2578268262U, // <6,3,5,0>: Cost 3 vext1 <4,6,3,5>, LHS + 3204259840U, // <6,3,5,1>: Cost 3 ins <u,3,5,1>, lane 0 + 2648903448U, // <6,3,5,2>: Cost 3 vext2 <5,2,6,3>, <5,2,6,3> + 2578270722U, // <6,3,5,3>: Cost 3 vext1 <4,6,3,5>, <3,4,5,6> + 2705934922U, 
// <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3204292608U, // <6,3,5,5>: Cost 3 ins <u,3,5,5>, lane 0 + 3204300800U, // <6,3,5,6>: Cost 3 ins <u,3,5,6>, lane 0 + 2130567168U, // <6,3,5,7>: Cost 2 ins <u,3,5,7>, lane 0 + 2130567168U, // <6,3,5,u>: Cost 2 ins <u,3,5,7>, lane 0 + 2982478742U, // <6,3,6,0>: Cost 3 vzipr <4,u,6,6>, <1,2,3,0> + 3115222694U, // <6,3,6,1>: Cost 3 vtrnr <4,6,4,6>, <2,3,0,1> + 2982478582U, // <6,3,6,2>: Cost 3 vzipr <4,u,6,6>, <1,0,3,2> + 1748984315U, // <6,3,6,3>: Cost 2 vuzpr <0,6,2,3>, <0,6,2,3> + 2982478746U, // <6,3,6,4>: Cost 3 vzipr <4,u,6,6>, <1,2,3,4> + 3115222734U, // <6,3,6,5>: Cost 3 vtrnr <4,6,4,6>, <2,3,4,5> + 2122006529U, // <6,3,6,6>: Cost 2 ins <6,u,6,6>, lane 1 + 2130640896U, // <6,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0 + 1748984315U, // <6,3,6,u>: Cost 2 vuzpr <0,6,2,3>, <0,6,2,3> + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 1906754376U, // <6,3,7,3>: Cost 2 vzipr RHS, <1,1,3,3> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 3103213262U, // <6,3,7,5>: Cost 3 vtrnr <2,6,3,7>, <2,3,4,5> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 1906754704U, // <6,3,7,7>: Cost 2 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2130149376U, // <6,3,u,1>: Cost 2 ins <u,3,0,1>, lane 0 + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 1906762568U, // <6,3,u,3>: Cost 2 vzipr RHS, <1,1,3,3> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2122006529U, // <6,3,u,6>: Cost 2 ins <6,u,6,6>, lane 1 + 1906762896U, // <6,3,u,7>: Cost 2 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 2242465098U, // <6,4,0,0>: Cost 3 vrev <4,6,0,0> + 2121523201U, // <6,4,0,1>: Cost 2 ins 
<6,u,0,1>, lane 1 + 1718534246U, // <6,4,0,2>: Cost 2 vuzpl <6,7,4,5>, LHS + 3195281409U, // <6,4,0,3>: Cost 3 ins <6,u,0,3>, lane 1 + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 1986645302U, // <6,4,0,6>: Cost 2 vtrnl <6,7,0,1>, RHS + 3195314177U, // <6,4,0,7>: Cost 3 ins <6,u,0,7>, lane 1 + 1986645320U, // <6,4,0,u>: Cost 2 vtrnl <6,7,0,1>, RHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 2242547028U, // <6,4,1,1>: Cost 3 vrev <4,6,1,1> + 3204636672U, // <6,4,1,2>: Cost 3 ins <u,4,1,2>, lane 0 + 1779220582U, // <6,4,1,3>: Cost 2 vuzpr <5,6,7,4>, LHS + 3059813748U, // <6,4,1,4>: Cost 3 vtrnl <6,6,1,3>, <4,6,4,6> + 2130919424U, // <6,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0 + 3102941532U, // <6,4,1,6>: Cost 3 vtrnr <2,6,0,1>, <0,4,2,6> + 2242989450U, // <6,4,1,7>: Cost 3 vrev <4,6,7,1> + 1779220587U, // <6,4,1,u>: Cost 2 vuzpr <5,6,7,4>, LHS + 1168739660U, // <6,4,2,0>: Cost 2 vrev <4,6,0,2> + 3195412481U, // <6,4,2,1>: Cost 3 ins <6,u,2,1>, lane 1 + 2242628958U, // <6,4,2,2>: Cost 3 vrev <4,6,2,2> + 2130976768U, // <6,4,2,3>: Cost 2 ins <u,4,2,3>, lane 0 + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 1849642294U, // <6,4,2,5>: Cost 2 vzipl <6,2,7,3>, RHS + 2131001344U, // <6,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0 + 3195461633U, // <6,4,2,7>: Cost 3 ins <6,u,2,7>, lane 1 + 1169329556U, // <6,4,2,u>: Cost 2 vrev <4,6,u,2> + 3195478017U, // <6,4,3,0>: Cost 3 ins <6,u,3,0>, lane 1 + 2242563414U, // <6,4,3,1>: Cost 3 vrev <4,6,1,3> + 2242637151U, // <6,4,3,2>: Cost 3 vrev <4,6,2,3> + 2242710888U, // <6,4,3,3>: Cost 3 vrev <4,6,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 2846623438U, // <6,4,3,5>: Cost 3 vuzpr <4,6,2,4>, <2,3,4,5> + 2965864652U, // <6,4,3,6>: Cost 3 vzipr <2,1,6,3>, <0,2,4,6> + 2852963328U, // <6,4,3,7>: Cost 3 vuzpr <5,6,7,4>, <1,3,5,7> + 2243079573U, // <6,4,3,u>: Cost 3 vrev <4,6,u,3> + 2242497870U, // <6,4,4,0>: Cost 3 vrev 
<4,6,0,4> + 2852967732U, // <6,4,4,1>: Cost 3 vuzpr <5,6,7,4>, <7,4,0,1> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 2852967014U, // <6,4,4,3>: Cost 3 vuzpr <5,6,7,4>, <6,4,1,3> + 2131132416U, // <6,4,4,4>: Cost 2 ins <u,4,4,4>, lane 0 + 2121850881U, // <6,4,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 1718537526U, // <6,4,4,6>: Cost 2 vuzpl <6,7,4,5>, RHS + 2852967054U, // <6,4,4,7>: Cost 3 vuzpr <5,6,7,4>, <6,4,5,7> + 1718537544U, // <6,4,4,u>: Cost 2 vuzpl <6,7,4,5>, RHS + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 2242579800U, // <6,4,5,1>: Cost 3 vrev <4,6,1,5> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2242727274U, // <6,4,5,3>: Cost 3 vrev <4,6,3,5> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2131214336U, // <6,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0 + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1779223862U, // <6,4,5,7>: Cost 2 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1169067380U, // <6,4,6,4>: Cost 2 vrev <4,6,4,6> + 1852247350U, // <6,4,6,5>: Cost 2 vzipl <6,6,6,6>, RHS + 1986465078U, // <6,4,6,6>: Cost 2 vtrnl <6,6,6,6>, RHS + 2131304448U, // <6,4,6,7>: Cost 2 ins <u,4,6,7>, lane 0 + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 2980495398U, // <6,4,7,2>: Cost 3 vzipr RHS, <0,0,4,2> + 2122055681U, // <6,4,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 1906756816U, // <6,4,7,4>: Cost 2 vzipr RHS, <4,4,4,4> + 1906755278U, // <6,4,7,5>: Cost 2 vzipr RHS, <2,3,4,5> + 1906753740U, // <6,4,7,6>: Cost 2 vzipr RHS, <0,2,4,6> + 2122088449U, // <6,4,7,7>: Cost 2 ins <6,u,7,7>, lane 1 + 
1906753742U, // <6,4,7,u>: Cost 2 vzipr RHS, <0,2,4,u> + 1168788818U, // <6,4,u,0>: Cost 2 vrev <4,6,0,u> + 2121523201U, // <6,4,u,1>: Cost 2 ins <6,u,0,1>, lane 1 + 1718540078U, // <6,4,u,2>: Cost 2 vuzpl <6,7,4,5>, LHS + 1779221149U, // <6,4,u,3>: Cost 2 vuzpr <5,6,7,4>, LHS + 1906765008U, // <6,4,u,4>: Cost 2 vzipr RHS, <4,4,4,4> + 1906763470U, // <6,4,u,5>: Cost 2 vzipr RHS, <2,3,4,5> + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1779224105U, // <6,4,u,7>: Cost 2 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3195256833U, // <6,5,0,0>: Cost 3 ins <6,u,0,0>, lane 1 + 2121523201U, // <6,5,0,1>: Cost 2 ins <6,u,0,1>, lane 1 + 2787721318U, // <6,5,0,2>: Cost 3 vuzpl <6,0,5,7>, LHS + 3195281409U, // <6,5,0,3>: Cost 3 ins <6,u,0,3>, lane 1 + 2790367436U, // <6,5,0,4>: Cost 3 vuzpl <6,4,5,6>, <0,2,4,6> + 3121369092U, // <6,5,0,5>: Cost 3 vtrnr <5,6,7,0>, <5,5,5,5> + 2980440578U, // <6,5,0,6>: Cost 3 vzipr <4,5,6,0>, <3,4,5,6> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 2047626551U, // <6,5,0,u>: Cost 2 vtrnr <5,6,7,0>, RHS + 2578382950U, // <6,5,1,0>: Cost 3 vext1 <4,6,5,1>, LHS + 3205292032U, // <6,5,1,1>: Cost 3 ins <u,5,1,1>, lane 0 + 3195346945U, // <6,5,1,2>: Cost 3 ins <6,u,1,2>, lane 1 + 2834833510U, // <6,5,1,3>: Cost 3 vuzpr <2,6,4,5>, LHS + 2578386296U, // <6,5,1,4>: Cost 3 vext1 <4,6,5,1>, <4,6,5,1> + 2578387072U, // <6,5,1,5>: Cost 3 vext1 <4,6,5,1>, <5,7,1,3> + 2922205282U, // <6,5,1,6>: Cost 3 vzipl <6,1,0,3>, <5,6,7,0> + 2131599360U, // <6,5,1,7>: Cost 2 ins <u,5,1,7>, lane 0 + 2131599360U, // <6,5,1,u>: Cost 2 ins <u,5,1,7>, lane 0 + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 2982448018U, // <6,5,2,1>: Cost 3 vzipr <4,u,6,2>, <4,0,5,1> + 3195420673U, // <6,5,2,2>: Cost 3 ins <6,u,2,2>, lane 1 + 2131640320U, // <6,5,2,3>: Cost 2 ins <u,5,2,3>, lane 0 + 2578394489U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, <4,6,5,2> + 3114897412U, // <6,5,2,5>: Cost 3 vtrnr <4,6,0,2>, <5,5,5,5> + 
2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 2041154870U, // <6,5,2,7>: Cost 2 vtrnr <4,6,0,2>, RHS + 2041154871U, // <6,5,2,u>: Cost 2 vtrnr <4,6,0,2>, RHS + 3195478017U, // <6,5,3,0>: Cost 3 ins <6,u,3,0>, lane 1 + 3205439488U, // <6,5,3,1>: Cost 3 ins <u,5,3,1>, lane 0 + 3091164465U, // <6,5,3,2>: Cost 3 vtrnr <0,6,2,3>, <4,5,6,2> + 3195502593U, // <6,5,3,3>: Cost 3 ins <6,u,3,3>, lane 1 + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3205472256U, // <6,5,3,5>: Cost 3 ins <u,5,3,5>, lane 0 + 2980465154U, // <6,5,3,6>: Cost 3 vzipr <4,5,6,3>, <3,4,5,6> + 2131746816U, // <6,5,3,7>: Cost 2 ins <u,5,3,7>, lane 0 + 2131746816U, // <6,5,3,u>: Cost 2 ins <u,5,3,7>, lane 0 + 2789051724U, // <6,5,4,0>: Cost 3 vuzpl <6,2,5,7>, <4,6,0,2> + 3060715648U, // <6,5,4,1>: Cost 3 vtrnl <6,7,4,5>, <5,7,1,3> + 3195568129U, // <6,5,4,2>: Cost 3 ins <6,u,4,2>, lane 1 + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2791705972U, // <6,5,4,4>: Cost 3 vuzpl <6,6,5,7>, <4,6,4,6> + 2121850881U, // <6,5,4,5>: Cost 2 ins <6,u,4,5>, lane 1 + 2834833756U, // <6,5,4,6>: Cost 3 vuzpr <2,6,4,5>, <0,4,2,6> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3006363382U, // <6,5,5,1>: Cost 3 vzipr <u,u,6,5>, <u,0,5,1> + 3205595136U, // <6,5,5,2>: Cost 3 ins <u,5,5,2>, lane 0 + 2980479105U, // <6,5,5,3>: Cost 3 vzipr <4,5,6,5>, <0,1,5,3> + 2578419068U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, <4,6,5,5> + 2131877888U, // <6,5,5,5>: Cost 2 ins <u,5,5,5>, lane 0 + 2979154434U, // <6,5,5,6>: Cost 3 vzipr <4,3,6,5>, <3,4,5,6> + 2131894272U, // <6,5,5,7>: Cost 2 ins <u,5,5,7>, lane 0 + 2131877888U, // <6,5,5,u>: Cost 2 ins <u,5,5,5>, lane 0 + 2131910656U, // <6,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0 + 2131918848U, // <6,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 2131927040U, // <6,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2131935232U, 
// <6,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0 + 2131943424U, // <6,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0 + 2131951616U, // <6,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 2131959808U, // <6,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0 + 1058226176U, // <6,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <6,5,6,u>: Cost 1 ins RHS, lane 0 + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 1906756498U, // <6,5,7,1>: Cost 2 vzipr RHS, <4,0,5,1> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 2122055681U, // <6,5,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 1906756826U, // <6,5,7,5>: Cost 2 vzipr RHS, <4,4,5,5> + 1906756098U, // <6,5,7,6>: Cost 2 vzipr RHS, <3,4,5,6> + 2029473078U, // <6,5,7,7>: Cost 2 vtrnr <2,6,3,7>, RHS + 2029473079U, // <6,5,7,u>: Cost 2 vtrnr <2,6,3,7>, RHS + 2131910656U, // <6,5,u,0>: Cost 2 ins <u,5,6,0>, lane 0 + 1906764690U, // <6,5,u,1>: Cost 2 vzipr RHS, <4,0,5,1> + 2131927040U, // <6,5,u,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2122055681U, // <6,5,u,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2131943424U, // <6,5,u,4>: Cost 2 ins <u,5,6,4>, lane 0 + 1906765018U, // <6,5,u,5>: Cost 2 vzipr RHS, <4,4,5,5> + 1906764290U, // <6,5,u,6>: Cost 2 vzipr RHS, <3,4,5,6> + 1058226176U, // <6,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <6,5,u,u>: Cost 1 ins RHS, lane 0 + 2047627362U, // <6,6,0,0>: Cost 2 vtrnr <5,6,7,0>, <5,6,7,0> + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 1718026342U, // <6,6,0,2>: Cost 2 vuzpl <6,6,6,6>, LHS + 3195281409U, // <6,6,0,3>: Cost 3 ins <6,u,0,3>, lane 1 + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3195297793U, // <6,6,0,5>: Cost 3 ins <6,u,0,5>, lane 1 + 2120826882U, // <6,6,0,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2120835074U, // <6,6,0,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 1906707760U, // <6,6,1,1>: Cost 2 vzipr <4,5,6,1>, 
<4,5,6,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 1773043814U, // <6,6,1,3>: Cost 2 vuzpr <4,6,4,6>, LHS + 3194068995U, // <6,6,1,4>: Cost 3 ins <6,6,1,u>, lane 3 + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2120826882U, // <6,6,1,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2120835074U, // <6,6,1,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1773043819U, // <6,6,1,u>: Cost 2 vuzpr <4,6,4,6>, LHS + 3114896750U, // <6,6,2,0>: Cost 3 vtrnr <4,6,0,2>, <4,6,4,0> + 3195412481U, // <6,6,2,1>: Cost 3 ins <6,u,2,1>, lane 1 + 2041154892U, // <6,6,2,2>: Cost 2 vtrnr <4,6,0,2>, <4,6,0,2> + 2120843269U, // <6,6,2,3>: Cost 2 ins <6,6,u,u>, lane 5 + 3114897510U, // <6,6,2,4>: Cost 3 vtrnr <4,6,0,2>, <5,6,7,4> + 3195445249U, // <6,6,2,5>: Cost 3 ins <6,u,2,5>, lane 1 + 2120826882U, // <6,6,2,6>: Cost 2 ins <6,6,u,6>, lane 2 + 1908706614U, // <6,6,2,7>: Cost 2 vzipr <4,u,6,2>, RHS + 1908706615U, // <6,6,2,u>: Cost 2 vzipr <4,u,6,2>, RHS + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 2846787238U, // <6,6,3,1>: Cost 3 vuzpr <4,6,4,6>, <2,3,0,1> + 3206111232U, // <6,6,3,2>: Cost 3 ins <u,6,3,2>, lane 0 + 1880178826U, // <6,6,3,3>: Cost 2 vzipr <0,1,6,3>, <0,1,6,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 2846787278U, // <6,6,3,5>: Cost 3 vuzpr <4,6,4,6>, <2,3,4,5> + 2120826882U, // <6,6,3,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2132410368U, // <6,6,3,7>: Cost 2 ins <u,6,3,7>, lane 0 + 2132410368U, // <6,6,3,u>: Cost 2 ins <u,6,3,7>, lane 0 + 2846790288U, // <6,6,4,0>: Cost 3 vuzpr <4,6,4,6>, <6,4,6,0> + 3194527746U, // <6,6,4,1>: Cost 3 ins <6,6,u,1>, lane 2 + 2846788778U, // <6,6,4,2>: Cost 3 vuzpr <4,6,4,6>, <4,4,0,2> + 3195576321U, // <6,6,4,3>: Cost 3 ins <6,u,4,3>, lane 1 + 2047660134U, // <6,6,4,4>: Cost 2 vtrnr <5,6,7,4>, <5,6,7,4> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 1718029622U, // <6,6,4,6>: Cost 2 vuzpl <6,6,6,6>, RHS + 2120835074U, // <6,6,4,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1573195304U, 
// <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3194363907U, // <6,6,5,0>: Cost 3 ins <6,6,5,u>, lane 3 + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3206258688U, // <6,6,5,2>: Cost 3 ins <u,6,5,2>, lane 0 + 3194544130U, // <6,6,5,3>: Cost 3 ins <6,6,u,3>, lane 2 + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 1906740532U, // <6,6,5,5>: Cost 2 vzipr <4,5,6,5>, <4,5,6,5> + 2120826882U, // <6,6,5,6>: Cost 2 ins <6,6,u,6>, lane 2 + 1773047094U, // <6,6,5,7>: Cost 2 vuzpr <4,6,4,6>, RHS + 1773047095U, // <6,6,5,u>: Cost 2 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2120695811U, // <6,6,6,1>: Cost 2 ins <6,6,6,u>, lane 3 + 2120695811U, // <6,6,6,2>: Cost 2 ins <6,6,6,u>, lane 3 + 2120695811U, // <6,6,6,3>: Cost 2 ins <6,6,6,u>, lane 3 + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2120695811U, // <6,6,6,5>: Cost 2 ins <6,6,6,u>, lane 3 + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 1908739382U, // <6,6,6,7>: Cost 2 vzipr <4,u,6,6>, RHS + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2132647936U, // <6,6,7,0>: Cost 2 ins <u,6,7,0>, lane 0 + 2120769539U, // <6,6,7,1>: Cost 2 ins <6,6,7,u>, lane 3 + 1908747164U, // <6,6,7,2>: Cost 2 vzipr RHS, <4,0,6,2> + 2122055681U, // <6,6,7,3>: Cost 2 ins <6,u,7,3>, lane 1 + 2132680704U, // <6,6,7,4>: Cost 2 ins <u,6,7,4>, lane 0 + 2120769539U, // <6,6,7,5>: Cost 2 ins <6,6,7,u>, lane 3 + 1906758456U, // <6,6,7,6>: Cost 2 vzipr RHS, <6,6,6,6> + 833015094U, // <6,6,7,7>: Cost 1 vzipr RHS, RHS + 833015095U, // <6,6,7,u>: Cost 1 vzipr RHS, RHS + 2047627362U, // <6,6,u,0>: Cost 2 vtrnr <5,6,7,0>, <5,6,7,0> + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 1906764700U, // <6,6,u,2>: Cost 2 vzipr RHS, <4,0,6,2> + 1773044381U, // <6,6,u,3>: Cost 2 vuzpr <4,6,4,6>, LHS + 2047660134U, // <6,6,u,4>: Cost 2 vtrnr <5,6,7,4>, <5,6,7,4> + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 833023286U, // 
<6,6,u,7>: Cost 1 vzipr RHS, RHS + 833023287U, // <6,6,u,u>: Cost 1 vzipr RHS, RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2120916995U, // <6,7,0,3>: Cost 2 ins <6,7,0,u>, lane 3 + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2120916995U, // <6,7,0,6>: Cost 2 ins <6,7,0,u>, lane 3 + 2120916995U, // <6,7,0,7>: Cost 2 ins <6,7,0,u>, lane 3 + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1761034342U, // <6,7,1,3>: Cost 2 vuzpr <2,6,3,7>, LHS + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2121498626U, // <6,7,1,7>: Cost 2 ins <6,7,u,7>, lane 2 + 1761034347U, // <6,7,1,u>: Cost 2 vuzpr <2,6,3,7>, LHS + 2121064451U, // <6,7,2,0>: Cost 2 ins <6,7,2,u>, lane 3 + 2121449474U, // <6,7,2,1>: Cost 2 ins <6,7,u,1>, lane 2 + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1059889156U, // <6,7,2,3>: Cost 1 ins LHS, lane 4 + 2121064451U, // <6,7,2,4>: Cost 2 ins <6,7,2,u>, lane 3 + 2121482242U, // <6,7,2,5>: Cost 2 ins <6,7,u,5>, lane 2 + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2121498626U, // <6,7,2,7>: Cost 2 ins <6,7,u,7>, lane 2 + 1059889156U, // <6,7,2,u>: Cost 1 ins LHS, lane 4 + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2121449474U, // <6,7,3,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2133696516U, // <6,7,3,2>: Cost 2 ins <u,u,3,2>, lane 4 + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2121482242U, // <6,7,3,5>: Cost 2 ins <6,7,u,5>, lane 2 + 2834777789U, // <6,7,3,6>: Cost 3 vuzpr 
<2,6,3,7>, <2,3,2,6> + 2133737476U, // <6,7,3,7>: Cost 2 ins <u,u,3,7>, lane 4 + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2121449474U, // <6,7,4,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2121211907U, // <6,7,4,2>: Cost 2 ins <6,7,4,u>, lane 3 + 2121211907U, // <6,7,4,3>: Cost 2 ins <6,7,4,u>, lane 3 + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203276U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,0,2> + 2121211907U, // <6,7,4,7>: Cost 2 ins <6,7,4,u>, lane 3 + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2121465858U, // <6,7,5,3>: Cost 2 ins <6,7,u,3>, lane 2 + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1761037622U, // <6,7,5,7>: Cost 2 vuzpr <2,6,3,7>, RHS + 1761037623U, // <6,7,5,u>: Cost 2 vuzpr <2,6,3,7>, RHS + 2121359363U, // <6,7,6,0>: Cost 2 ins <6,7,6,u>, lane 3 + 2121449474U, // <6,7,6,1>: Cost 2 ins <6,7,u,1>, lane 2 + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2121465858U, // <6,7,6,3>: Cost 2 ins <6,7,u,3>, lane 2 + 2121359363U, // <6,7,6,4>: Cost 2 ins <6,7,6,u>, lane 3 + 2121482242U, // <6,7,6,5>: Cost 2 ins <6,7,u,5>, lane 2 + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1060216836U, // <6,7,6,7>: Cost 1 ins RHS, lane 4 + 1060216836U, // <6,7,6,u>: Cost 1 ins RHS, lane 4 + 1906757730U, // <6,7,7,0>: Cost 2 vzipr RHS, <5,6,7,0> + 2121449474U, // <6,7,7,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 1906758138U, // <6,7,7,3>: Cost 2 vzipr RHS, <6,2,7,3> + 1906757734U, // <6,7,7,4>: Cost 2 vzipr RHS, <5,6,7,4> + 2121482242U, // <6,7,7,5>: Cost 2 ins <6,7,u,5>, 
lane 2 + 1906757574U, // <6,7,7,6>: Cost 2 vzipr RHS, <5,4,7,6> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1906757738U, // <6,7,7,u>: Cost 2 vzipr RHS, <5,6,7,u> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1059889156U, // <6,7,u,3>: Cost 1 ins LHS, lane 4 + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 1060216836U, // <6,7,u,7>: Cost 1 ins RHS, lane 4 + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2047623837U, // <6,u,0,3>: Cost 2 vtrnr <5,6,7,0>, LHS + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 1986648218U, // <6,u,0,6>: Cost 2 vtrnl <6,7,0,1>, RHS + 2047626793U, // <6,u,0,7>: Cost 2 vtrnr <5,6,7,0>, RHS + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1761042534U, // <6,u,1,3>: Cost 2 vuzpr <2,6,3,u>, LHS + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2120826882U, // <6,u,1,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2120835074U, // <6,u,1,7>: Cost 2 ins <6,6,u,7>, lane 2 + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 1849644846U, // <6,u,2,1>: Cost 2 vzipl <6,2,7,3>, LHS + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1055244288U, // <6,u,2,3>: Cost 1 ins LHS, lane 0 + 1504873876U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, <4,6,u,2> + 1849645210U, // <6,u,2,5>: Cost 
2 vzipl <6,2,7,3>, RHS + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2041155113U, // <6,u,2,7>: Cost 2 vtrnr <4,6,0,2>, RHS + 1055244288U, // <6,u,2,u>: Cost 1 ins LHS, lane 0 + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2121449474U, // <6,u,3,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2128388096U, // <6,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2121482242U, // <6,u,3,5>: Cost 2 ins <6,7,u,5>, lane 2 + 2120826882U, // <6,u,3,6>: Cost 2 ins <6,6,u,6>, lane 2 + 2131746816U, // <6,u,3,7>: Cost 2 ins <u,5,3,7>, lane 0 + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2121449474U, // <6,u,4,1>: Cost 2 ins <6,7,u,1>, lane 2 + 1986975534U, // <6,u,4,2>: Cost 2 vtrnl <6,7,4,5>, LHS + 2047656605U, // <6,u,4,3>: Cost 2 vtrnr <5,6,7,4>, LHS + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220812U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,0,2> + 2047659561U, // <6,u,4,7>: Cost 2 vtrnr <5,6,7,4>, RHS + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2118148098U, // <6,u,5,3>: Cost 2 ins <6,2,u,3>, lane 2 + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1761045814U, // <6,u,5,7>: Cost 2 vuzpr <2,6,3,u>, RHS + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 1852249902U, // <6,u,6,1>: Cost 2 vzipl <6,6,6,6>, LHS + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2041479837U, // <6,u,6,3>: Cost 2 vtrnr <4,6,4,6>, LHS + 1504906648U, // <6,u,6,4>: Cost 2 vext1 
<4,6,u,6>, <4,6,u,6> + 1852250266U, // <6,u,6,5>: Cost 2 vzipl <6,6,6,6>, RHS + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1058226176U, // <6,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <6,u,6,u>: Cost 1 ins RHS, lane 0 + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 1906753609U, // <6,u,7,1>: Cost 2 vzipr RHS, <0,0,u,1> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 833011868U, // <6,u,7,3>: Cost 1 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 1906753937U, // <6,u,7,5>: Cost 2 vzipr RHS, <0,4,u,5> + 1906753776U, // <6,u,7,6>: Cost 2 vzipr RHS, <0,2,u,6> + 833015112U, // <6,u,7,7>: Cost 1 vzipr RHS, RHS + 833011873U, // <6,u,7,u>: Cost 1 vzipr RHS, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 833020060U, // <6,u,u,3>: Cost 1 vzipr RHS, LHS + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 833023304U, // <6,u,u,7>: Cost 1 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 2987152532U, // <7,0,0,3>: Cost 3 vzipr <5,6,7,0>, <7,2,0,3> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2987152210U, // <7,0,0,5>: Cost 3 vzipr <5,6,7,0>, <6,7,0,5> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 2987152050U, // <7,0,0,7>: Cost 3 vzipr <5,6,7,0>, <6,5,0,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2128232448U, // <7,0,1,1>: Cost 2 ins <u,0,1,1>, lane 0 + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2122317827U, // <7,0,1,3>: Cost 2 ins <7,0,1,u>, lane 3 + 1516883254U, // <7,0,1,4>: Cost 2 vext1 
<6,7,0,1>, RHS + 2122317827U, // <7,0,1,5>: Cost 2 ins <7,0,1,u>, lane 3 + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2122317827U, // <7,0,1,7>: Cost 2 ins <7,0,1,u>, lane 3 + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2128314368U, // <7,0,2,2>: Cost 2 ins <u,0,2,2>, lane 0 + 2122833925U, // <7,0,2,3>: Cost 2 ins <7,0,u,u>, lane 5 + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2712060126U, // <7,0,2,6>: Cost 3 vext3 RHS, <0,2,6,6> + 3201433601U, // <7,0,2,7>: Cost 3 ins <7,u,2,7>, lane 1 + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2983854080U, // <7,0,3,0>: Cost 3 vzipr <5,1,7,3>, <0,0,0,0> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2128388096U, // <7,0,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 3196559362U, // <7,0,3,6>: Cost 3 ins <7,0,u,6>, lane 2 + 3201507329U, // <7,0,3,7>: Cost 3 ins <7,u,3,7>, lane 1 + 2128388096U, // <7,0,3,u>: Cost 2 ins <u,0,3,2>, lane 0 + 2712060230U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,2> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3201548289U, // <7,0,4,3>: Cost 3 ins <7,u,4,3>, lane 1 + 2712060269U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,5> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606348U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,0,2> + 3201581057U, // <7,0,4,7>: Cost 3 ins <7,u,4,7>, lane 1 + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2647625340U, // <7,0,5,0>: Cost 3 vext2 <5,0,7,0>, <5,0,7,0> + 2128527360U, // <7,0,5,1>: Cost 2 ins <u,0,5,1>, lane 0 + 1991032934U, // <7,0,5,2>: Cost 2 vtrnl 
<7,4,5,6>, LHS + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2847477046U, // <7,0,5,7>: Cost 3 vuzpr <4,7,5,0>, RHS + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 2985869312U, // <7,0,6,0>: Cost 3 vzipr <5,4,7,6>, <0,0,0,0> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2128609280U, // <7,0,6,2>: Cost 2 ins <u,0,6,2>, lane 0 + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3202367488U, // <7,0,6,4>: Cost 3 ins <u,0,6,4>, lane 0 + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2122833925U, // <7,0,6,7>: Cost 2 ins <7,0,u,u>, lane 5 + 2128609280U, // <7,0,6,u>: Cost 2 ins <u,0,6,2>, lane 0 + 2847477192U, // <7,0,7,0>: Cost 3 vuzpr <4,7,5,0>, <4,7,5,0> + 1858961510U, // <7,0,7,1>: Cost 2 vzipl <7,7,7,7>, LHS + 1993179238U, // <7,0,7,2>: Cost 2 vtrnl <7,7,7,7>, LHS + 3201769473U, // <7,0,7,3>: Cost 3 ins <7,u,7,3>, lane 1 + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2128060417U, // <7,0,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1858962077U, // <7,0,7,u>: Cost 2 vzipl <7,7,7,7>, LHS + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2122317827U, // <7,0,u,3>: Cost 2 ins <7,0,1,u>, lane 3 + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2122317827U, // <7,0,u,7>: Cost 2 ins <7,0,1,u>, lane 3 + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2712060634U, // 
<7,1,0,0>: Cost 3 vext3 RHS, <1,0,0,1> + 2128822272U, // <7,1,0,1>: Cost 2 ins <u,1,0,1>, lane 0 + 1719615590U, // <7,1,0,2>: Cost 2 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2859062268U, // <7,1,0,4>: Cost 3 vuzpr <6,7,0,1>, <7,0,1,4> + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2859061568U, // <7,1,0,6>: Cost 3 vuzpr <6,7,0,1>, <6,0,4,6> + 3201286145U, // <7,1,0,7>: Cost 3 ins <7,u,0,7>, lane 1 + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060714U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,0> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 2127577089U, // <7,1,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 2859057294U, // <7,1,1,7>: Cost 3 vuzpr <6,7,0,1>, <0,1,6,7> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2128961536U, // <7,1,2,0>: Cost 2 ins <u,1,2,0>, lane 0 + 2128969728U, // <7,1,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 2128977920U, // <7,1,2,2>: Cost 2 ins <u,1,2,2>, lane 0 + 1055244288U, // <7,1,2,3>: Cost 1 ins LHS, lane 0 + 2128994304U, // <7,1,2,4>: Cost 2 ins <u,1,2,4>, lane 0 + 2129002496U, // <7,1,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2129010688U, // <7,1,2,6>: Cost 2 ins <u,1,2,6>, lane 0 + 2129018880U, // <7,1,2,7>: Cost 2 ins <u,1,2,7>, lane 0 + 1055244288U, // <7,1,2,u>: Cost 1 ins LHS, lane 0 + 1510998118U, // <7,1,3,0>: Cost 2 vext1 <5,7,1,3>, LHS + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2047869030U, // <7,1,3,3>: Cost 2 vtrnr <5,7,1,3>, LHS + 1511001398U, // <7,1,3,4>: Cost 2 vext1 <5,7,1,3>, RHS + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2983859604U, // <7,1,3,7>: Cost 3 
vzipr <5,1,7,3>, <7,5,1,7> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2129133568U, // <7,1,4,3>: Cost 2 ins <u,1,4,3>, lane 0 + 2859060432U, // <7,1,4,4>: Cost 3 vuzpr <6,7,0,1>, <4,4,4,4> + 2129149952U, // <7,1,4,5>: Cost 2 ins <u,1,4,5>, lane 0 + 1719618870U, // <7,1,4,6>: Cost 2 vuzpl <7,0,1,2>, RHS + 2793360778U, // <7,1,4,7>: Cost 3 vuzpl <7,0,1,2>, <4,6,7,1> + 1719618888U, // <7,1,4,u>: Cost 2 vuzpl <7,0,1,2>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3202940928U, // <7,1,5,2>: Cost 3 ins <u,1,5,2>, lane 0 + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 2985861458U, // <7,1,5,5>: Cost 3 vzipr <5,4,7,5>, <0,4,1,5> + 2127904769U, // <7,1,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1785318710U, // <7,1,5,7>: Cost 2 vuzpr <6,7,0,1>, RHS + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 2653606230U, // <7,1,6,0>: Cost 3 vext2 <6,0,7,1>, <6,0,7,1> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2129281024U, // <7,1,6,3>: Cost 2 ins <u,1,6,3>, lane 0 + 2859061350U, // <7,1,6,4>: Cost 3 vuzpr <6,7,0,1>, <5,6,7,4> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 2859060596U, // <7,1,6,6>: Cost 3 vuzpr <6,7,0,1>, <4,6,4,6> + 2129313792U, // <7,1,6,7>: Cost 2 ins <u,1,6,7>, lane 0 + 2129281024U, // <7,1,6,u>: Cost 2 ins <u,1,6,3>, lane 0 + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 1785320270U, // <7,1,7,1>: Cost 2 vuzpr <6,7,0,1>, <6,7,0,1> + 2986543254U, // <7,1,7,2>: Cost 3 vzipr <5,5,7,7>, <3,0,1,2> + 2048196710U, // <7,1,7,3>: Cost 2 vtrnr <5,7,5,7>, LHS + 2793362538U, // <7,1,7,4>: Cost 3 vuzpl 
<7,0,1,2>, <7,1,4,6> + 2986541394U, // <7,1,7,5>: Cost 3 vzipr <5,5,7,7>, <0,4,1,5> + 3201794049U, // <7,1,7,6>: Cost 3 ins <7,u,7,6>, lane 1 + 2128060417U, // <7,1,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 2048196715U, // <7,1,7,u>: Cost 2 vtrnr <5,7,5,7>, LHS + 1511039078U, // <7,1,u,0>: Cost 2 vext1 <5,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 1719621422U, // <7,1,u,2>: Cost 2 vuzpl <7,0,1,2>, LHS + 1055244288U, // <7,1,u,3>: Cost 1 ins LHS, lane 0 + 1511042358U, // <7,1,u,4>: Cost 2 vext1 <5,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 1719621786U, // <7,1,u,6>: Cost 2 vuzpl <7,0,1,2>, RHS + 1785318953U, // <7,1,u,7>: Cost 2 vuzpr <6,7,0,1>, RHS + 1055244288U, // <7,1,u,u>: Cost 1 ins LHS, lane 0 + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2129494016U, // <7,2,0,2>: Cost 2 ins <u,2,0,2>, lane 0 + 1913405542U, // <7,2,0,3>: Cost 2 vzipr <5,6,7,0>, LHS + 2712061400U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,2> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 2927577066U, // <7,2,0,7>: Cost 3 vzipl <7,0,1,2>, <2,7,0,1> + 1913405547U, // <7,2,0,u>: Cost 2 vzipr <5,6,7,0>, LHS + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3203301376U, // <7,2,1,1>: Cost 3 ins <u,2,1,1>, lane 0 + 2127577089U, // <7,2,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2974548070U, // <7,2,1,3>: Cost 3 vzipr <3,5,7,1>, LHS + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3203334144U, // <7,2,1,5>: Cost 3 ins <u,2,1,5>, lane 0 + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 2127577089U, // <7,2,1,u>: Cost 2 ins <7,u,1,2>, lane 1 + 2712061524U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,0> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 
1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061564U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,4> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061581U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,3> + 3201433601U, // <7,2,2,7>: Cost 3 ins <7,u,2,7>, lane 1 + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 1638319802U, // <7,2,3,2>: Cost 2 vext3 RHS, <2,3,2,3> + 1910112358U, // <7,2,3,3>: Cost 2 vzipr <5,1,7,3>, LHS + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 1625048802U, // <7,2,3,6>: Cost 2 vext3 <2,3,6,7>, <2,3,6,7> + 2990495214U, // <7,2,3,7>: Cost 3 vzipr <6,2,7,3>, <7,6,2,7> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061688U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,2> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 1913438310U, // <7,2,4,3>: Cost 2 vzipr <5,6,7,4>, LHS + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2129821696U, // <7,2,4,6>: Cost 2 ins <u,2,4,6>, lane 0 + 3201581057U, // <7,2,4,7>: Cost 3 ins <7,u,4,7>, lane 1 + 1913438315U, // <7,2,4,u>: Cost 2 vzipr <5,6,7,4>, LHS + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3203596288U, // <7,2,5,1>: Cost 3 ins <u,2,5,1>, lane 0 + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3203629056U, // <7,2,5,5>: Cost 3 ins <u,2,5,5>, lane 0 + 2127904769U, // <7,2,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 2853096758U, // <7,2,5,7>: Cost 3 vuzpr <5,7,0,2>, RHS + 2127904769U, // <7,2,5,u>: Cost 2 ins <7,u,5,6>, lane 1 + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, 
LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 2129977344U, // <7,2,6,7>: Cost 2 ins <u,2,6,7>, lane 0 + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 3121939350U, // <7,2,7,0>: Cost 3 vtrnr <5,7,5,7>, <1,2,3,0> + 3203743744U, // <7,2,7,1>: Cost 3 ins <u,2,7,1>, lane 0 + 1720366165U, // <7,2,7,2>: Cost 2 vuzpl <7,1,2,3>, <7,1,2,3> + 1912799334U, // <7,2,7,3>: Cost 2 vzipr <5,5,7,7>, LHS + 3121939354U, // <7,2,7,4>: Cost 3 vtrnr <5,7,5,7>, <1,2,3,4> + 3203776512U, // <7,2,7,5>: Cost 3 ins <u,2,7,5>, lane 0 + 2986541404U, // <7,2,7,6>: Cost 3 vzipr <5,5,7,7>, <0,4,2,6> + 2128060417U, // <7,2,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1912799339U, // <7,2,7,u>: Cost 2 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 2129494016U, // <7,2,u,2>: Cost 2 ins <u,2,0,2>, lane 0 + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2129821696U, // <7,2,u,6>: Cost 2 ins <u,2,4,6>, lane 0 + 2129977344U, // <7,2,u,7>: Cost 2 ins <u,2,6,7>, lane 0 + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2712062119U, // <7,3,0,3>: Cost 3 vext3 RHS, <3,0,3,1> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 2985157776U, // 
<7,3,0,7>: Cost 3 vzipr <5,3,7,0>, <1,5,3,7> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2127577089U, // <7,3,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1779433574U, // <7,3,1,3>: Cost 2 vuzpr <5,7,1,3>, LHS + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 2853179064U, // <7,3,1,6>: Cost 3 vuzpr <5,7,1,3>, <5,1,4,6> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 1779433579U, // <7,3,1,u>: Cost 2 vuzpr <5,7,1,3>, LHS + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2130313216U, // <7,3,2,3>: Cost 2 ins <u,3,2,3>, lane 0 + 2712062292U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,3> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2130313216U, // <7,3,2,u>: Cost 2 ins <u,3,2,3>, lane 0 + 2712062334U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,0> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2990491658U, // <7,3,3,6>: Cost 3 vzipr <6,2,7,3>, <2,7,3,6> + 2972574864U, // <7,3,3,7>: Cost 3 vzipr <3,2,7,3>, <1,5,3,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2987180790U, // <7,3,4,2>: Cost 3 vzipr <5,6,7,4>, <1,0,3,2> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062455U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,4> + 1638320642U, 
// <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313164U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,0,2> + 2985190544U, // <7,3,4,7>: Cost 3 vzipr <5,3,7,4>, <1,5,3,7> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2712062498U, // <7,3,5,0>: Cost 3 vext3 RHS, <3,5,0,2> + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2127904769U, // <7,3,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1779436854U, // <7,3,5,7>: Cost 2 vuzpr <5,7,1,3>, RHS + 1779436855U, // <7,3,5,u>: Cost 2 vuzpr <5,7,1,3>, RHS + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2853178744U, // <7,3,6,1>: Cost 3 vuzpr <5,7,1,3>, <4,6,5,1> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3204366336U, // <7,3,6,5>: Cost 3 ins <u,3,6,5>, lane 0 + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2130640896U, // <7,3,6,7>: Cost 2 ins <u,3,6,7>, lane 0 + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 1779437696U, // <7,3,7,3>: Cost 2 vuzpr <5,7,1,3>, <5,7,1,3> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2237582070U, // <7,3,7,5>: Cost 3 vrev <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2128060417U, // <7,3,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1779437696U, // <7,3,7,u>: Cost 2 vuzpr <5,7,1,3>, <5,7,1,3> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 
2 vext2 <u,2,7,3>, <u,2,7,3> + 1779434141U, // <7,3,u,3>: Cost 2 vuzpr <5,7,1,3>, LHS + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2127904769U, // <7,3,u,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1779437097U, // <7,3,u,7>: Cost 2 vuzpr <5,7,1,3>, RHS + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2714053478U, // <7,4,0,0>: Cost 3 vext3 RHS, <4,0,0,2> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3201253377U, // <7,4,0,3>: Cost 3 ins <7,u,0,3>, lane 1 + 2714053512U, // <7,4,0,4>: Cost 3 vext3 RHS, <4,0,4,0> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 2927578568U, // <7,4,0,7>: Cost 3 vzipl <7,0,1,2>, <4,7,5,0> + 1640311726U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,2> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2127577089U, // <7,4,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 3127495888U, // <7,4,1,4>: Cost 3 vtrnr <6,7,0,1>, <4,4,4,4> + 2130919424U, // <7,4,1,5>: Cost 2 ins <u,4,1,5>, lane 0 + 1988054326U, // <7,4,1,6>: Cost 2 vtrnl <7,0,1,2>, RHS + 3061796234U, // <7,4,1,7>: Cost 3 vtrnl <7,0,1,2>, <4,6,7,1> + 1988054344U, // <7,4,1,u>: Cost 2 vtrnl <7,0,1,2>, RHS + 3204694016U, // <7,4,2,0>: Cost 3 ins <u,4,2,0>, lane 0 + 3199172610U, // <7,4,2,1>: Cost 3 ins <7,4,u,1>, lane 2 + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2125488133U, // <7,4,2,3>: Cost 2 ins <7,4,u,u>, lane 5 + 2853258138U, // <7,4,2,4>: Cost 3 vuzpr <5,7,2,4>, <1,2,3,4> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2131001344U, // <7,4,2,6>: Cost 2 ins <u,4,2,6>, lane 0 + 3201433601U, // <7,4,2,7>: Cost 3 ins <7,u,2,7>, lane 1 + 2125488133U, // <7,4,2,u>: Cost 2 ins <7,4,u,u>, lane 5 + 2651637910U, // <7,4,3,0>: 
Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3201458177U, // <7,4,3,1>: Cost 3 ins <7,u,3,1>, lane 1 + 3204784128U, // <7,4,3,2>: Cost 3 ins <u,4,3,2>, lane 0 + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2983857360U, // <7,4,3,4>: Cost 3 vzipr <5,1,7,3>, <4,4,4,4> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2125471746U, // <7,4,3,6>: Cost 2 ins <7,4,u,6>, lane 2 + 3201507329U, // <7,4,3,7>: Cost 3 ins <7,u,3,7>, lane 1 + 2125471746U, // <7,4,3,u>: Cost 2 ins <7,4,u,6>, lane 2 + 2714053800U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,0> + 3201531905U, // <7,4,4,1>: Cost 3 ins <7,u,4,1>, lane 1 + 3201540097U, // <7,4,4,2>: Cost 3 ins <7,u,4,2>, lane 1 + 2987185336U, // <7,4,4,3>: Cost 3 vzipr <5,6,7,4>, <7,2,4,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2987185664U, // <7,4,4,7>: Cost 3 vzipr <5,6,7,4>, <7,6,4,7> + 1640312054U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,6> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2125266947U, // <7,4,5,1>: Cost 2 ins <7,4,5,u>, lane 3 + 2125266947U, // <7,4,5,2>: Cost 2 ins <7,4,5,u>, lane 3 + 2125266947U, // <7,4,5,3>: Cost 2 ins <7,4,5,u>, lane 3 + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2131214336U, // <7,4,5,5>: Cost 2 ins <u,4,5,5>, lane 0 + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2125266947U, // <7,4,5,7>: Cost 2 ins <7,4,5,u>, lane 3 + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 1638468940U, // <7,4,6,0>: Cost 2 vext3 RHS, <4,6,0,2> + 2712063318U, // <7,4,6,1>: Cost 3 vext3 RHS, <4,6,1,3> + 2712210780U, // <7,4,6,2>: Cost 3 vext3 RHS, <4,6,2,0> + 2712210790U, // <7,4,6,3>: Cost 3 vext3 RHS, <4,6,3,1> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2131296256U, // <7,4,6,6>: Cost 2 ins <u,4,6,6>, lane 0 + 2125488133U, // <7,4,6,7>: Cost 2 ins <7,4,u,u>, lane 
5 + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 2794279930U, // <7,4,7,1>: Cost 3 vuzpl <7,1,4,6>, <7,0,1,2> + 3201761281U, // <7,4,7,2>: Cost 3 ins <7,u,7,2>, lane 1 + 3201769473U, // <7,4,7,3>: Cost 3 ins <7,u,7,3>, lane 1 + 2847509964U, // <7,4,7,4>: Cost 3 vuzpr <4,7,5,4>, <4,7,5,4> + 1858964790U, // <7,4,7,5>: Cost 2 vzipl <7,7,7,7>, RHS + 1993182518U, // <7,4,7,6>: Cost 2 vtrnl <7,7,7,7>, RHS + 2128060417U, // <7,4,7,7>: Cost 2 ins <7,u,7,7>, lane 1 + 1858965033U, // <7,4,7,u>: Cost 2 vzipl <7,7,7,7>, RHS + 1640312302U, // <7,4,u,0>: Cost 2 vext3 RHS, <4,u,0,2> + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2127577089U, // <7,4,u,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2125488133U, // <7,4,u,3>: Cost 2 ins <7,4,u,u>, lane 5 + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2125266947U, // <7,4,u,7>: Cost 2 ins <7,4,5,u>, lane 3 + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2131476480U, // <7,5,0,1>: Cost 2 ins <u,5,0,1>, lane 0 + 1722597478U, // <7,5,0,2>: Cost 2 vuzpl <7,4,5,6>, LHS + 3201253377U, // <7,5,0,3>: Cost 3 ins <7,u,0,3>, lane 1 + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2987150554U, // <7,5,0,5>: Cost 3 vzipr <5,6,7,0>, <4,4,5,5> + 2987149826U, // <7,5,0,6>: Cost 3 vzipr <5,6,7,0>, <3,4,5,6> + 2131525632U, // <7,5,0,7>: Cost 2 ins <u,5,0,7>, lane 0 + 1722597532U, // <7,5,0,u>: Cost 2 vuzpl <7,4,5,6>, LHS + 2714054287U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 2249183358U, // <7,5,1,1>: Cost 3 vrev <5,7,1,1> + 2127577089U, // <7,5,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1785643110U, // <7,5,1,3>: Cost 2 vuzpr <6,7,4,5>, LHS + 2714054327U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 3127496708U, // <7,5,1,5>: Cost 3 vtrnr <6,7,0,1>, <5,5,5,5> + 2590995323U, // <7,5,1,6>: Cost 
3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2249117814U, // <7,5,2,0>: Cost 3 vrev <5,7,0,2> + 2714054379U, // <7,5,2,1>: Cost 3 vext3 RHS, <5,2,1,3> + 2249265288U, // <7,5,2,2>: Cost 3 vrev <5,7,2,2> + 2131640320U, // <7,5,2,3>: Cost 2 ins <u,5,2,3>, lane 0 + 2859385754U, // <7,5,2,4>: Cost 3 vuzpr <6,7,4,5>, <1,2,3,4> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2712063768U, // <7,5,2,6>: Cost 3 vext3 RHS, <5,2,6,3> + 2131673088U, // <7,5,2,7>: Cost 2 ins <u,5,2,7>, lane 0 + 2131640320U, // <7,5,2,u>: Cost 2 ins <u,5,2,3>, lane 0 + 3201449985U, // <7,5,3,0>: Cost 3 ins <7,u,3,0>, lane 1 + 1175457920U, // <7,5,3,1>: Cost 2 vrev <5,7,1,3> + 2249273481U, // <7,5,3,2>: Cost 3 vrev <5,7,2,3> + 2249347218U, // <7,5,3,3>: Cost 3 vrev <5,7,3,3> + 3201482753U, // <7,5,3,4>: Cost 3 ins <7,u,3,4>, lane 1 + 2983857370U, // <7,5,3,5>: Cost 3 vzipr <5,1,7,3>, <4,4,5,5> + 2983856642U, // <7,5,3,6>: Cost 3 vzipr <5,1,7,3>, <3,4,5,6> + 2047872310U, // <7,5,3,7>: Cost 2 vtrnr <5,7,1,3>, RHS + 2047872311U, // <7,5,3,u>: Cost 2 vtrnr <5,7,1,3>, RHS + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 2987182994U, // <7,5,4,1>: Cost 3 vzipr <5,6,7,4>, <4,0,5,1> + 2249281674U, // <7,5,4,2>: Cost 3 vrev <5,7,2,4> + 3201548289U, // <7,5,4,3>: Cost 3 ins <7,u,4,3>, lane 1 + 2579074508U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, <4,7,5,4> + 2131804160U, // <7,5,4,5>: Cost 2 ins <u,5,4,5>, lane 0 + 1722600758U, // <7,5,4,6>: Cost 2 vuzpl <7,4,5,6>, RHS + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2714054620U, // <7,5,5,1>: Cost 3 vext3 RHS, <5,5,1,1> + 3201613825U, // <7,5,5,2>: Cost 3 ins <7,u,5,2>, lane 1 + 2649657204U, // <7,5,5,3>: Cost 3 vext2 <5,3,7,5>, <5,3,7,5> + 2714054651U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,5> + 1638322180U, // 
<7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2127904769U, // <7,5,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2131910656U, // <7,5,6,0>: Cost 2 ins <u,5,6,0>, lane 0 + 2131918848U, // <7,5,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 2131927040U, // <7,5,6,2>: Cost 2 ins <u,5,6,2>, lane 0 + 2131935232U, // <7,5,6,3>: Cost 2 ins <u,5,6,3>, lane 0 + 2131943424U, // <7,5,6,4>: Cost 2 ins <u,5,6,4>, lane 0 + 2131951616U, // <7,5,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 2131959808U, // <7,5,6,6>: Cost 2 ins <u,5,6,6>, lane 0 + 1058226176U, // <7,5,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <7,5,6,u>: Cost 1 ins RHS, lane 0 + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 1638469760U, // <7,5,7,1>: Cost 2 vext3 RHS, <5,7,1,3> + 2712211590U, // <7,5,7,2>: Cost 3 vext3 RHS, <5,7,2,0> + 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2048199990U, // <7,5,7,7>: Cost 2 vtrnr <5,7,5,7>, RHS + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 1638469841U, // <7,5,u,1>: Cost 2 vext3 RHS, <5,u,1,3> + 1722603310U, // <7,5,u,2>: Cost 2 vuzpl <7,4,5,6>, LHS + 1785643677U, // <7,5,u,3>: Cost 2 vuzpr <6,7,4,5>, LHS + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 1722603674U, // <7,5,u,6>: Cost 2 vuzpl <7,4,5,6>, RHS + 1058226176U, // <7,5,u,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <7,5,u,u>: Cost 1 ins RHS, lane 0 + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2132148224U, // <7,6,0,2>: Cost 2 ins <u,6,0,2>, lane 0 + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // 
<7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2987151292U, // <7,6,0,5>: Cost 3 vzipr <5,6,7,0>, <5,4,6,5> + 2987150564U, // <7,6,0,6>: Cost 3 vzipr <5,6,7,0>, <4,4,6,6> + 1913408822U, // <7,6,0,7>: Cost 2 vzipr <5,6,7,0>, RHS + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2127577089U, // <7,6,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 2841329766U, // <7,6,1,3>: Cost 3 vuzpr <3,7,2,6>, LHS + 2579123666U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, <4,7,6,1> + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2974551350U, // <7,6,1,7>: Cost 3 vzipr <3,5,7,1>, RHS + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2714055117U, // <7,6,2,2>: Cost 3 vext3 RHS, <6,2,2,3> + 2132303872U, // <7,6,2,3>: Cost 2 ins <u,6,2,3>, lane 0 + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2714055152U, // <7,6,2,6>: Cost 3 vext3 RHS, <6,2,6,2> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 3121614200U, // <7,6,3,1>: Cost 3 vtrnr <5,7,1,3>, <4,6,5,1> + 1181504354U, // <7,6,3,2>: Cost 2 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 3206135808U, // <7,6,3,5>: Cost 3 ins <u,6,3,5>, lane 0 + 2983857380U, // <7,6,3,6>: Cost 3 vzipr <5,1,7,3>, <4,4,6,6> + 1910115638U, // <7,6,3,7>: Cost 2 vzipr <5,1,7,3>, RHS + 1910115639U, // <7,6,3,u>: Cost 2 vzipr <5,1,7,3>, RHS + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2714055276U, // <7,6,4,2>: Cost 
3 vext3 RHS, <6,4,2,0> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2650328272U, // <7,6,4,4>: Cost 3 vext2 <5,4,7,6>, <4,4,4,4> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2132475904U, // <7,6,4,6>: Cost 2 ins <u,6,4,6>, lane 0 + 1913441590U, // <7,6,4,7>: Cost 2 vzipr <5,6,7,4>, RHS + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3201622017U, // <7,6,5,3>: Cost 3 ins <7,u,5,3>, lane 1 + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2127904769U, // <7,6,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 2971929910U, // <7,6,5,7>: Cost 3 vzipr <3,1,7,5>, RHS + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2712212245U, // <7,6,6,2>: Cost 3 vext3 RHS, <6,6,2,7> + 3201695745U, // <7,6,6,3>: Cost 3 ins <7,u,6,3>, lane 1 + 2714055461U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,5> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 1638323042U, // <7,6,7,2>: Cost 2 vext3 RHS, <6,7,2,3> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 1638323082U, // <7,6,7,6>: Cost 2 vext3 RHS, <6,7,6,7> + 1912802614U, // <7,6,7,7>: Cost 2 vzipr <5,5,7,7>, RHS + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // 
<7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2132148224U, // <7,6,u,2>: Cost 2 ins <u,6,0,2>, lane 0 + 2132303872U, // <7,6,u,3>: Cost 2 ins <u,6,2,3>, lane 0 + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2132475904U, // <7,6,u,6>: Cost 2 ins <u,6,4,6>, lane 0 + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 1913409634U, // <7,7,0,0>: Cost 2 vzipr <5,6,7,0>, <5,6,7,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 1724743782U, // <7,7,0,2>: Cost 2 vuzpl <7,7,7,7>, LHS + 2987151056U, // <7,7,0,3>: Cost 3 vzipr <5,6,7,0>, <5,1,7,3> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2987151302U, // <7,7,0,6>: Cost 3 vzipr <5,6,7,0>, <5,4,7,6> + 2127470594U, // <7,7,0,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2053755726U, // <7,7,1,1>: Cost 2 vtrnr <6,7,0,1>, <6,7,0,1> + 2127577089U, // <7,7,1,2>: Cost 2 ins <7,u,1,2>, lane 1 + 1779761254U, // <7,7,1,3>: Cost 2 vuzpr <5,7,5,7>, LHS + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2127470594U, // <7,7,1,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1779761259U, // <7,7,1,u>: Cost 2 vuzpr <5,7,5,7>, LHS + 2853503894U, // <7,7,2,0>: Cost 3 vuzpr <5,7,5,7>, <1,2,3,0> + 3206692864U, // <7,7,2,1>: Cost 3 ins <u,7,2,1>, lane 0 + 1988801621U, // <7,7,2,2>: Cost 2 vtrnl <7,1,2,3>, <7,1,2,3> + 2132967424U, // <7,7,2,3>: Cost 2 ins <u,7,2,3>, lane 0 + 2853503898U, // <7,7,2,4>: Cost 3 vuzpr <5,7,5,7>, <1,2,3,4> + 3206725632U, // <7,7,2,5>: Cost 3 ins <u,7,2,5>, lane 0 + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2127470594U, // <7,7,2,7>: Cost 2 ins <7,7,u,7>, lane 2 + 
1988801621U, // <7,7,2,u>: Cost 2 vtrnl <7,1,2,3>, <7,1,2,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3121615694U, // <7,7,3,1>: Cost 3 vtrnr <5,7,1,3>, <6,7,0,1> + 3201171458U, // <7,7,3,2>: Cost 3 ins <7,7,u,2>, lane 2 + 1910116048U, // <7,7,3,3>: Cost 2 vzipr <5,1,7,3>, <5,1,7,3> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2639055462U, // <7,7,3,5>: Cost 3 vext2 <3,5,7,7>, <3,5,7,7> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2127470594U, // <7,7,3,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1910116048U, // <7,7,3,u>: Cost 2 vzipr <5,1,7,3>, <5,1,7,3> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3062715386U, // <7,7,4,1>: Cost 3 vtrnl <7,1,4,6>, <7,0,1,2> + 3201540097U, // <7,7,4,2>: Cost 3 ins <7,u,4,2>, lane 1 + 2987183824U, // <7,7,4,3>: Cost 3 vzipr <5,6,7,4>, <5,1,7,3> + 1913442406U, // <7,7,4,4>: Cost 2 vzipr <5,6,7,4>, <5,6,7,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 1724747062U, // <7,7,4,6>: Cost 2 vuzpl <7,7,7,7>, RHS + 2127470594U, // <7,7,4,7>: Cost 2 ins <7,7,u,7>, lane 2 + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2853508547U, // <7,7,5,0>: Cost 3 vuzpr <5,7,5,7>, <7,5,7,0> + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3201613825U, // <7,7,5,2>: Cost 3 ins <7,u,5,2>, lane 1 + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2127904769U, // <7,7,5,6>: Cost 2 ins <7,u,5,6>, lane 1 + 1779764534U, // <7,7,5,7>: Cost 2 vuzpr <5,7,5,7>, RHS + 1779764535U, // <7,7,5,u>: Cost 2 vuzpr <5,7,5,7>, RHS + 2985873506U, // <7,7,6,0>: Cost 3 vzipr <5,4,7,6>, <5,6,7,0> + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2985873104U, // <7,7,6,3>: Cost 3 vzipr <5,4,7,6>, <5,1,7,3> + 2985873510U, // <7,7,6,4>: Cost 3 vzipr <5,4,7,6>, <5,6,7,4> + 2985873511U, // 
<7,7,6,5>: Cost 3 vzipr <5,4,7,6>, <5,6,7,5> + 1912131526U, // <7,7,6,6>: Cost 2 vzipr <5,4,7,6>, <5,4,7,6> + 2133295104U, // <7,7,6,7>: Cost 2 ins <u,7,6,7>, lane 0 + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2127405059U, // <7,7,7,1>: Cost 2 ins <7,7,7,u>, lane 3 + 2127405059U, // <7,7,7,2>: Cost 2 ins <7,7,7,u>, lane 3 + 2127405059U, // <7,7,7,3>: Cost 2 ins <7,7,7,u>, lane 3 + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2127405059U, // <7,7,7,5>: Cost 2 ins <7,7,7,u>, lane 3 + 2127405059U, // <7,7,7,6>: Cost 2 ins <7,7,7,u>, lane 3 + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1913409634U, // <7,7,u,0>: Cost 2 vzipr <5,6,7,0>, <5,6,7,0> + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 1724749614U, // <7,7,u,2>: Cost 2 vuzpl <7,7,7,7>, LHS + 1779761821U, // <7,7,u,3>: Cost 2 vuzpr <5,7,5,7>, LHS + 1913442406U, // <7,7,u,4>: Cost 2 vzipr <5,6,7,4>, <5,6,7,4> + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1724749978U, // <7,7,u,6>: Cost 2 vuzpl <7,7,7,7>, RHS + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> + 1720131686U, // <7,u,0,2>: Cost 2 vuzpl <7,0,u,2>, LHS + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> + 1853839514U, // <7,u,0,5>: Cost 2 vzipl <7,0,1,2>, RHS + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1913408840U, // <7,u,0,7>: Cost 2 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 2128232448U, // <7,u,1,1>: Cost 2 ins <u,0,1,1>, lane 0 + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> + 1517473078U, // <7,u,1,4>: Cost 2 
vext1 <6,7,u,1>, RHS + 2122317827U, // <7,u,1,5>: Cost 2 ins <7,0,1,u>, lane 3 + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1662211948U, // <7,u,2,0>: Cost 2 vext3 RHS, <u,2,0,2> + 2128969728U, // <7,u,2,1>: Cost 2 ins <u,1,2,1>, lane 0 + 2128314368U, // <7,u,2,2>: Cost 2 ins <u,0,2,2>, lane 0 + 1055244288U, // <7,u,2,3>: Cost 1 ins LHS, lane 0 + 1662211988U, // <7,u,2,4>: Cost 2 vext3 RHS, <u,2,4,6> + 2129002496U, // <7,u,2,5>: Cost 2 ins <u,1,2,5>, lane 0 + 2131001344U, // <7,u,2,6>: Cost 2 ins <u,4,2,6>, lane 0 + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1055244288U, // <7,u,2,u>: Cost 1 ins LHS, lane 0 + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638324167U, // <7,u,3,1>: Cost 2 vext3 RHS, <u,3,1,3> + 2128388096U, // <7,u,3,2>: Cost 2 ins <u,0,3,2>, lane 0 + 1910112412U, // <7,u,3,3>: Cost 2 vzipr <5,1,7,3>, LHS + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2125471746U, // <7,u,3,6>: Cost 2 ins <7,4,u,6>, lane 2 + 1910115656U, // <7,u,3,7>: Cost 2 vzipr <5,1,7,3>, RHS + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1856821038U, // <7,u,4,1>: Cost 2 vzipl <7,4,5,6>, LHS + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 1913438364U, // <7,u,4,3>: Cost 2 vzipr <5,6,7,4>, LHS + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1720134966U, // <7,u,4,6>: Cost 2 vuzpl <7,0,u,2>, RHS + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 1991038766U, // <7,u,5,2>: Cost 2 vtrnl <7,4,5,6>, LHS + 1638324351U, // <7,u,5,3>: Cost 2 
vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 1662359728U, // <7,u,6,0>: Cost 2 vext3 RHS, <u,6,0,2> + 2131918848U, // <7,u,6,1>: Cost 2 ins <u,5,6,1>, lane 0 + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1662359768U, // <7,u,6,4>: Cost 2 vext3 RHS, <u,6,4,6> + 2131951616U, // <7,u,6,5>: Cost 2 ins <u,5,6,5>, lane 0 + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1058226176U, // <7,u,6,7>: Cost 1 ins RHS, lane 0 + 1058226176U, // <7,u,6,u>: Cost 1 ins RHS, lane 0 + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 1640462603U, // <7,u,7,1>: Cost 2 vext3 RHS, <u,7,1,3> + 1993185070U, // <7,u,7,2>: Cost 2 vtrnl <7,7,7,7>, LHS + 1912799388U, // <7,u,7,3>: Cost 2 vzipr <5,5,7,7>, LHS + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1640462643U, // <7,u,7,5>: Cost 2 vext3 RHS, <u,7,5,7> + 1993185434U, // <7,u,7,6>: Cost 2 vtrnl <7,7,7,7>, RHS + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1055244288U, // <7,u,u,3>: Cost 1 ins LHS, lane 0 + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 1058226176U, // <7,u,u,7>: Cost 1 ins RHS, lane 0 + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2085707777U, // <u,0,0,3>: Cost 2 ins 
<0,u,0,3>, lane 1 + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2080440323U, // <u,0,0,5>: Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // <u,0,0,6>: Cost 2 ins <0,0,0,u>, lane 3 + 2080440323U, // <u,0,0,7>: Cost 2 ins <0,0,0,u>, lane 3 + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 786808934U, // <u,0,1,1>: Cost 1 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 1756332134U, // <u,0,1,3>: Cost 2 vuzpr <1,u,3,0>, LHS + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2085797889U, // <u,0,1,5>: Cost 2 ins <0,u,1,5>, lane 1 + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2080514051U, // <u,0,1,7>: Cost 2 ins <0,0,1,u>, lane 3 + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 1994768394U, // <u,0,2,1>: Cost 2 vtrnl LHS, <0,0,1,1> + 921026662U, // <u,0,2,2>: Cost 1 vtrnl LHS, LHS + 1012113409U, // <u,0,2,3>: Cost 1 ins LHS, lane 1 + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2080587779U, // <u,0,2,5>: Cost 2 ins <0,0,2,u>, lane 3 + 2085879809U, // <u,0,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2080587779U, // <u,0,2,7>: Cost 2 ins <0,0,2,u>, lane 3 + 921026716U, // <u,0,2,u>: Cost 1 vtrnl LHS, LHS + 1880326144U, // <u,0,3,0>: Cost 2 vzipr LHS, <0,0,0,0> + 1880327846U, // <u,0,3,1>: Cost 2 vzipr LHS, <2,3,0,1> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2091900929U, // <u,0,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 2091909121U, // <u,0,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 2086633475U, // <u,0,3,5>: Cost 2 ins <1,0,3,u>, lane 3 + 2086633475U, // <u,0,3,6>: Cost 2 ins <1,0,3,u>, lane 3 + 2091933697U, // <u,0,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 1705610572U, // <u,0,4,0>: Cost 2 vuzpl <4,6,0,2>, <4,6,0,2> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 2086002689U, // <u,0,4,3>: Cost 2 ins 
<0,u,4,3>, lane 1 + 1947828428U, // <u,0,4,4>: Cost 2 vtrnl <0,2,4,6>, <0,2,4,6> + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1726844214U, // <u,0,4,6>: Cost 2 vuzpl <u,2,0,2>, RHS + 2109923329U, // <u,0,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 1863532544U, // <u,0,5,0>: Cost 2 vzipl RHS, <0,0,0,0> + 789790822U, // <u,0,5,1>: Cost 1 vzipl RHS, LHS + 1996349542U, // <u,0,5,2>: Cost 2 vtrnl <u,3,5,7>, LHS + 2104696835U, // <u,0,5,3>: Cost 2 ins <4,0,5,u>, lane 3 + 1863532882U, // <u,0,5,4>: Cost 2 vzipl RHS, <0,4,1,5> + 2109980673U, // <u,0,5,5>: Cost 2 ins <4,u,5,5>, lane 1 + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 1756335414U, // <u,0,5,7>: Cost 2 vuzpr <1,u,3,0>, RHS + 789791389U, // <u,0,5,u>: Cost 1 vzipl RHS, LHS + 1997750272U, // <u,0,6,0>: Cost 2 vtrnl RHS, <0,0,0,0> + 1997750282U, // <u,0,6,1>: Cost 2 vtrnl RHS, <0,0,1,1> + 924008550U, // <u,0,6,2>: Cost 1 vtrnl RHS, LHS + 2104770563U, // <u,0,6,3>: Cost 2 ins <4,0,6,u>, lane 3 + 1146503858U, // <u,0,6,4>: Cost 2 vrev <0,u,4,6> + 2104770563U, // <u,0,6,5>: Cost 2 ins <4,0,6,u>, lane 3 + 2110062593U, // <u,0,6,6>: Cost 2 ins <4,u,6,6>, lane 1 + 1036328961U, // <u,0,6,7>: Cost 1 ins RHS, lane 1 + 924008604U, // <u,0,6,u>: Cost 1 vtrnl RHS, LHS + 1906900992U, // <u,0,7,0>: Cost 2 vzipr RHS, <0,0,0,0> + 1906902694U, // <u,0,7,1>: Cost 2 vzipr RHS, <2,3,0,1> + 1906901156U, // <u,0,7,2>: Cost 2 vzipr RHS, <0,2,0,2> + 2116083713U, // <u,0,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 2116091905U, // <u,0,7,4>: Cost 2 ins <5,u,7,4>, lane 1 + 2980643874U, // <u,0,7,5>: Cost 3 vzipr RHS, <1,4,0,5> + 2116108289U, // <u,0,7,6>: Cost 2 ins <5,u,7,6>, lane 1 + 2116116481U, // <u,0,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 1906901162U, // <u,0,7,u>: Cost 2 vzipr RHS, <0,2,0,u> + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 791453798U, // <u,0,u,1>: Cost 1 vzipl LHS, LHS + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 1012113409U, // <u,0,u,3>: 
Cost 1 ins LHS, lane 1 + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 1036328961U, // <u,0,u,7>: Cost 1 ins RHS, lane 1 + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1818149622U, // <u,1,0,0>: Cost 2 vzipl <1,0,3,2>, <1,0,3,2> + 1007951877U, // <u,1,0,1>: Cost 1 ins LHS, lane 5 + 1725587558U, // <u,1,0,2>: Cost 2 vuzpl <u,0,1,2>, LHS + 1007910914U, // <u,1,0,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <u,1,0,4>: Cost 2 ins <0,1,u,4>, lane 2 + 2081669122U, // <u,1,0,5>: Cost 2 ins <0,1,u,5>, lane 2 + 2081677314U, // <u,1,0,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // <u,1,0,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007951877U, // <u,1,0,u>: Cost 1 ins LHS, lane 5 + 1481786002U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, <0,u,1,1> + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 1860551574U, // <u,1,1,2>: Cost 2 vzipl LHS, <1,2,3,0> + 1007910914U, // <u,1,1,3>: Cost 1 ins LHS, lane 2 + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 1860551824U, // <u,1,1,5>: Cost 2 vzipl LHS, <1,5,3,7> + 2081677314U, // <u,1,1,6>: Cost 2 ins <0,1,u,6>, lane 2 + 2081685506U, // <u,1,1,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // <u,1,1,u>: Cost 1 ins LHS, lane 2 + 1007509507U, // <u,1,2,0>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,2,1>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,2,2>: Cost 1 ins LHS, lane 3 + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1007509507U, // <u,1,2,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,2,5>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,2,6>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,2,7>: Cost 1 ins LHS, lane 3 + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 1880328342U, // <u,1,3,2>: Cost 2 vzipr LHS, <3,0,1,2> + 945004646U, // <u,1,3,3>: Cost 1 vtrnr LHS, LHS + 1487777078U, // 
<u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2087297027U, // <u,1,3,6>: Cost 2 ins <1,1,3,u>, lane 3 + 2133737476U, // <u,1,3,7>: Cost 2 ins <u,u,3,7>, lane 4 + 945004651U, // <u,1,3,u>: Cost 1 vtrnr LHS, LHS + 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2081636354U, // <u,1,4,1>: Cost 2 ins <0,1,u,1>, lane 2 + 2081644546U, // <u,1,4,2>: Cost 2 ins <0,1,u,2>, lane 2 + 1007910914U, // <u,1,4,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <u,1,4,4>: Cost 2 ins <0,1,u,4>, lane 2 + 1007951877U, // <u,1,4,5>: Cost 1 ins LHS, lane 5 + 1725590838U, // <u,1,4,6>: Cost 2 vuzpl <u,0,1,2>, RHS + 2081685506U, // <u,1,4,7>: Cost 2 ins <0,1,u,7>, lane 2 + 1007910914U, // <u,1,4,u>: Cost 1 ins LHS, lane 2 + 1481818774U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, <0,u,1,5> + 1863533364U, // <u,1,5,1>: Cost 2 vzipl RHS, <1,1,1,1> + 1863533462U, // <u,1,5,2>: Cost 2 vzipl RHS, <1,2,3,0> + 1007910914U, // <u,1,5,3>: Cost 1 ins LHS, lane 2 + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 1863533712U, // <u,1,5,5>: Cost 2 vzipl RHS, <1,5,3,7> + 2133876740U, // <u,1,5,6>: Cost 2 ins <u,u,5,6>, lane 4 + 1750224182U, // <u,1,5,7>: Cost 2 vuzpr <0,u,1,1>, RHS + 1007910914U, // <u,1,5,u>: Cost 1 ins LHS, lane 2 + 2081628162U, // <u,1,6,0>: Cost 2 ins <0,1,u,0>, lane 2 + 1997751092U, // <u,1,6,1>: Cost 2 vtrnl RHS, <1,1,1,1> + 2133917700U, // <u,1,6,2>: Cost 2 ins <u,u,6,2>, lane 4 + 1007910914U, // <u,1,6,3>: Cost 1 ins LHS, lane 2 + 2081660930U, // <u,1,6,4>: Cost 2 ins <0,1,u,4>, lane 2 + 1997751296U, // <u,1,6,5>: Cost 2 vtrnl RHS, <1,3,5,7> + 2133950468U, // <u,1,6,6>: Cost 2 ins <u,u,6,6>, lane 4 + 1060216836U, // <u,1,6,7>: Cost 1 ins RHS, lane 4 + 1007910914U, // <u,1,6,u>: Cost 1 ins LHS, lane 2 + 2133975044U, // <u,1,7,0>: Cost 2 ins <u,u,7,0>, lane 4 + 1906901002U, // <u,1,7,1>: Cost 2 vzipr RHS, <0,0,1,1> + 1906903190U, // <u,1,7,2>: Cost 2 vzipr RHS, <3,0,1,2> + 969220198U, // <u,1,7,3>: Cost 1 vtrnr RHS, 
LHS + 2134007812U, // <u,1,7,4>: Cost 2 ins <u,u,7,4>, lane 4 + 1152558485U, // <u,1,7,5>: Cost 2 vrev <1,u,5,7> + 2134024196U, // <u,1,7,6>: Cost 2 ins <u,u,7,6>, lane 4 + 2134032388U, // <u,1,7,7>: Cost 2 ins <u,u,7,7>, lane 4 + 969220203U, // <u,1,7,u>: Cost 1 vtrnr RHS, LHS + 1007509507U, // <u,1,u,0>: Cost 1 ins LHS, lane 3 + 1007951877U, // <u,1,u,1>: Cost 1 ins LHS, lane 5 + 1007509507U, // <u,1,u,2>: Cost 1 ins LHS, lane 3 + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1007509507U, // <u,1,u,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,u,5>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,u,6>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,1,u,7>: Cost 1 ins LHS, lane 3 + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1726332928U, // <u,2,0,0>: Cost 2 vuzpl LHS, <0,0,0,0> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 652591206U, // <u,2,0,2>: Cost 1 vuzpl LHS, LHS + 1886937190U, // <u,2,0,3>: Cost 2 vzipr <1,2,u,0>, LHS + 1726333132U, // <u,2,0,4>: Cost 2 vuzpl LHS, <0,2,4,6> + 2081767427U, // <u,2,0,5>: Cost 2 ins <0,2,0,u>, lane 3 + 2082340866U, // <u,2,0,6>: Cost 2 ins <0,2,u,6>, lane 2 + 2081767427U, // <u,2,0,7>: Cost 2 ins <0,2,0,u>, lane 3 + 652591260U, // <u,2,0,u>: Cost 1 vuzpl LHS, LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 1726333748U, // <u,2,1,1>: Cost 2 vuzpl LHS, <1,1,1,1> + 1860552296U, // <u,2,1,2>: Cost 2 vzipl LHS, <2,2,2,2> + 1750155366U, // <u,2,1,3>: Cost 2 vuzpr <0,u,0,2>, LHS + 2088296450U, // <u,2,1,4>: Cost 2 ins <1,2,u,4>, lane 2 + 1726333952U, // <u,2,1,5>: Cost 2 vuzpl LHS, <1,3,5,7> + 1860552634U, // <u,2,1,6>: Cost 2 vzipl LHS, <2,6,3,7> + 2109702145U, // <u,2,1,7>: Cost 2 ins <4,u,1,7>, lane 1 + 1750155371U, // <u,2,1,u>: Cost 2 vuzpr <0,u,0,2>, LHS + 1481867932U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, <0,u,2,2> + 2085838849U, // <u,2,2,1>: Cost 2 ins <0,u,2,1>, lane 1 + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1012113409U, // <u,2,2,3>: Cost 1 ins LHS, lane 1 + 1481870646U, // <u,2,2,4>: Cost 
2 vext1 <0,u,2,2>, RHS + 2085871617U, // <u,2,2,5>: Cost 2 ins <0,u,2,5>, lane 1 + 2085879809U, // <u,2,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 2085888001U, // <u,2,2,7>: Cost 2 ins <0,u,2,7>, lane 1 + 1012113409U, // <u,2,2,u>: Cost 1 ins LHS, lane 1 + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1880326164U, // <u,2,3,2>: Cost 2 vzipr LHS, <0,0,2,2> + 806584422U, // <u,2,3,3>: Cost 1 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1726335490U, // <u,2,3,5>: Cost 2 vuzpl LHS, <3,4,5,6> + 1880326492U, // <u,2,3,6>: Cost 2 vzipr LHS, <0,4,2,6> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 806584427U, // <u,2,3,u>: Cost 1 vzipr LHS, LHS + 1726336332U, // <u,2,4,0>: Cost 2 vuzpl LHS, <4,6,0,2> + 2082062339U, // <u,2,4,1>: Cost 2 ins <0,2,4,u>, lane 3 + 2082308098U, // <u,2,4,2>: Cost 2 ins <0,2,u,2>, lane 2 + 1886969958U, // <u,2,4,3>: Cost 2 vzipr <1,2,u,4>, LHS + 1726336208U, // <u,2,4,4>: Cost 2 vuzpl LHS, <4,4,4,4> + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 652594486U, // <u,2,4,6>: Cost 1 vuzpl LHS, RHS + 2082062339U, // <u,2,4,7>: Cost 2 ins <0,2,4,u>, lane 3 + 652594504U, // <u,2,4,u>: Cost 1 vuzpl LHS, RHS + 2088263682U, // <u,2,5,0>: Cost 2 ins <1,2,u,0>, lane 2 + 1726337152U, // <u,2,5,1>: Cost 2 vuzpl LHS, <5,7,1,3> + 1863534184U, // <u,2,5,2>: Cost 2 vzipl RHS, <2,2,2,2> + 1884987494U, // <u,2,5,3>: Cost 2 vzipr <0,u,u,5>, LHS + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 1726337028U, // <u,2,5,5>: Cost 2 vuzpl LHS, <5,5,5,5> + 1863534522U, // <u,2,5,6>: Cost 2 vzipl RHS, <2,6,3,7> + 1750158646U, // <u,2,5,7>: Cost 2 vuzpr <0,u,0,2>, RHS + 1750158647U, // <u,2,5,u>: Cost 2 vuzpr <0,u,0,2>, RHS + 1481900704U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, <0,u,2,6> + 2110021633U, // <u,2,6,1>: Cost 2 ins <4,u,6,1>, lane 1 + 1997751912U, // <u,2,6,2>: Cost 2 vtrnl RHS, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // 
<u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2110054401U, // <u,2,6,5>: Cost 2 ins <4,u,6,5>, lane 1 + 1726337848U, // <u,2,6,6>: Cost 2 vuzpl LHS, <6,6,6,6> + 1036328961U, // <u,2,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <u,2,6,u>: Cost 1 ins RHS, lane 1 + 2042962838U, // <u,2,7,0>: Cost 2 vtrnr RHS, <1,2,3,0> + 1726338042U, // <u,2,7,1>: Cost 2 vuzpl LHS, <7,0,1,2> + 1906901012U, // <u,2,7,2>: Cost 2 vzipr RHS, <0,0,2,2> + 833159270U, // <u,2,7,3>: Cost 1 vzipr RHS, LHS + 2042962842U, // <u,2,7,4>: Cost 2 vtrnr RHS, <1,2,3,4> + 1726338406U, // <u,2,7,5>: Cost 2 vuzpl LHS, <7,4,5,6> + 1906901340U, // <u,2,7,6>: Cost 2 vzipr RHS, <0,4,2,6> + 1726338668U, // <u,2,7,7>: Cost 2 vuzpl LHS, <7,7,7,7> + 833159275U, // <u,2,7,u>: Cost 1 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 652597038U, // <u,2,u,2>: Cost 1 vuzpl LHS, LHS + 806625382U, // <u,2,u,3>: Cost 1 vzipr LHS, LHS + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 652597402U, // <u,2,u,6>: Cost 1 vuzpl LHS, RHS + 1036328961U, // <u,2,u,7>: Cost 1 ins RHS, lane 1 + 806625387U, // <u,2,u,u>: Cost 1 vzipr LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2088951810U, // <u,3,0,3>: Cost 2 ins <1,3,u,3>, lane 2 + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2094940162U, // <u,3,0,5>: Cost 2 ins <2,3,u,5>, lane 2 + 2094374915U, // <u,3,0,6>: Cost 2 ins <2,3,0,u>, lane 3 + 2088984578U, // <u,3,0,7>: Cost 2 ins <1,3,u,7>, lane 2 + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 676569190U, // <u,3,1,3>: Cost 1 vuzpr LHS, LHS + 1860553218U, // <u,3,1,4>: Cost 2 vzipl LHS, 
<3,4,5,6> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2088476675U, // <u,3,1,6>: Cost 2 ins <1,3,1,u>, lane 3 + 2088984578U, // <u,3,1,7>: Cost 2 ins <1,3,u,7>, lane 2 + 676569195U, // <u,3,1,u>: Cost 1 vuzpr LHS, LHS + 1750311830U, // <u,3,2,0>: Cost 2 vuzpr LHS, <1,2,3,0> + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1012113409U, // <u,3,2,3>: Cost 1 ins LHS, lane 1 + 1750311834U, // <u,3,2,4>: Cost 2 vuzpr LHS, <1,2,3,4> + 1994770946U, // <u,3,2,5>: Cost 2 vtrnl LHS, <3,4,5,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2088984578U, // <u,3,2,7>: Cost 2 ins <1,3,u,7>, lane 2 + 1012113409U, // <u,3,2,u>: Cost 1 ins LHS, lane 1 + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1750312614U, // <u,3,3,1>: Cost 2 vuzpr LHS, <2,3,0,1> + 1880326902U, // <u,3,3,2>: Cost 2 vzipr LHS, <1,0,3,2> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 1750312654U, // <u,3,3,5>: Cost 2 vuzpr LHS, <2,3,4,5> + 2100568067U, // <u,3,3,6>: Cost 2 ins <3,3,3,u>, lane 3 + 1880327312U, // <u,3,3,7>: Cost 2 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2094669827U, // <u,3,4,2>: Cost 2 ins <2,3,4,u>, lane 3 + 2088951810U, // <u,3,4,3>: Cost 2 ins <1,3,u,3>, lane 2 + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1750311260U, // <u,3,4,6>: Cost 2 vuzpr LHS, <0,4,2,6> + 2088984578U, // <u,3,4,7>: Cost 2 ins <1,3,u,7>, lane 2 + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 1863534742U, // <u,3,5,0>: Cost 2 vzipl RHS, <3,0,1,2> + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2088771587U, // <u,3,5,2>: Cost 2 ins <1,3,5,u>, lane 3 + 1863535004U, // <u,3,5,3>: Cost 2 vzipl RHS, <3,3,3,3> + 1592561606U, // <u,3,5,4>: Cost 2 
vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 676572470U, // <u,3,5,7>: Cost 1 vuzpr LHS, RHS + 676572471U, // <u,3,5,u>: Cost 1 vuzpr LHS, RHS + 1798090850U, // <u,3,6,0>: Cost 2 vuzpr LHS, <5,6,7,0> + 1997752470U, // <u,3,6,1>: Cost 2 vtrnl RHS, <3,0,1,2> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 1997752732U, // <u,3,6,3>: Cost 2 vtrnl RHS, <3,3,3,3> + 1798090854U, // <u,3,6,4>: Cost 2 vuzpr LHS, <5,6,7,4> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1060216836U, // <u,3,6,7>: Cost 1 ins RHS, lane 4 + 1060216836U, // <u,3,6,u>: Cost 1 ins RHS, lane 4 + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 1906901832U, // <u,3,7,3>: Cost 2 vzipr RHS, <1,1,3,3> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2042963662U, // <u,3,7,5>: Cost 2 vtrnr RHS, <2,3,4,5> + 2134024196U, // <u,3,7,6>: Cost 2 ins <u,u,7,6>, lane 4 + 1906902160U, // <u,3,7,7>: Cost 2 vzipr RHS, <1,5,3,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1880367862U, // <u,3,u,2>: Cost 2 vzipr LHS, <1,0,3,2> + 676569757U, // <u,3,u,3>: Cost 1 vuzpr LHS, LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1750311584U, // <u,3,u,6>: Cost 2 vuzpr LHS, <0,u,2,6> + 676572713U, // <u,3,u,7>: Cost 1 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 1974046028U, // <u,4,0,0>: Cost 2 vtrnl <4,6,0,2>, <4,6,0,2> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1727168614U, // <u,4,0,2>: Cost 2 vuzpl <u,2,4,6>, LHS + 2085707777U, // <u,4,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 1679392972U, // 
<u,4,0,4>: Cost 2 vuzpl <0,2,4,6>, <0,2,4,6> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 2109628417U, // <u,4,0,7>: Cost 2 ins <4,u,0,7>, lane 1 + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 1860553618U, // <u,4,1,0>: Cost 2 vzipl LHS, <4,0,5,1> + 2085765121U, // <u,4,1,1>: Cost 2 ins <0,u,1,1>, lane 1 + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 1756364902U, // <u,4,1,3>: Cost 2 vuzpr <1,u,3,4>, LHS + 1860553936U, // <u,4,1,4>: Cost 2 vzipl LHS, <4,4,4,4> + 786812214U, // <u,4,1,5>: Cost 1 vzipl LHS, RHS + 1994026294U, // <u,4,1,6>: Cost 2 vtrnl <u,0,1,2>, RHS + 2083168259U, // <u,4,1,7>: Cost 2 ins <0,4,1,u>, lane 3 + 786812457U, // <u,4,1,u>: Cost 1 vzipl LHS, RHS + 1170066926U, // <u,4,2,0>: Cost 2 vrev <4,u,0,2> + 2083241987U, // <u,4,2,1>: Cost 2 ins <0,4,2,u>, lane 3 + 2085847041U, // <u,4,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <u,4,2,3>: Cost 1 ins LHS, lane 1 + 1994771664U, // <u,4,2,4>: Cost 2 vtrnl LHS, <4,4,4,4> + 1994771346U, // <u,4,2,5>: Cost 2 vtrnl LHS, <4,0,5,1> + 921029942U, // <u,4,2,6>: Cost 1 vtrnl LHS, RHS + 2083241987U, // <u,4,2,7>: Cost 2 ins <0,4,2,u>, lane 3 + 921029960U, // <u,4,2,u>: Cost 1 vtrnl LHS, RHS + 2091876353U, // <u,4,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 2954070192U, // <u,4,3,1>: Cost 3 vzipr LHS, <3,0,4,1> + 2091892737U, // <u,4,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // <u,4,3,3>: Cost 2 ins <1,u,3,3>, lane 1 + 1928105168U, // <u,4,3,4>: Cost 2 vzipr LHS, <4,4,4,4> + 1880327886U, // <u,4,3,5>: Cost 2 vzipr LHS, <2,3,4,5> + 1880326348U, // <u,4,3,6>: Cost 2 vzipr LHS, <0,2,4,6> + 2091933697U, // <u,4,3,7>: Cost 2 ins <1,u,3,7>, lane 1 + 1880326350U, // <u,4,3,u>: Cost 2 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2107277315U, // <u,4,4,1>: Cost 2 ins <4,4,4,u>, lane 3 + 2107277315U, // <u,4,4,2>: Cost 2 ins <4,4,4,u>, lane 3 + 2086002689U, // <u,4,4,3>: Cost 2 
ins <0,u,4,3>, lane 1 + 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2109923329U, // <u,4,4,7>: Cost 2 ins <4,u,4,7>, lane 1 + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2101379075U, // <u,4,5,1>: Cost 2 ins <3,4,5,u>, lane 3 + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2101379075U, // <u,4,5,3>: Cost 2 ins <3,4,5,u>, lane 3 + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 789794102U, // <u,4,5,5>: Cost 1 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 1756368182U, // <u,4,5,7>: Cost 2 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482048178U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, <0,u,4,6> + 2107424771U, // <u,4,6,1>: Cost 2 ins <4,4,6,u>, lane 3 + 2110029825U, // <u,4,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2107424771U, // <u,4,6,3>: Cost 2 ins <4,4,6,u>, lane 3 + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 1997753234U, // <u,4,6,5>: Cost 2 vtrnl RHS, <4,0,5,1> + 924011830U, // <u,4,6,6>: Cost 1 vtrnl RHS, RHS + 1036328961U, // <u,4,6,7>: Cost 1 ins RHS, lane 1 + 924011848U, // <u,4,6,u>: Cost 1 vtrnl RHS, RHS + 2116059137U, // <u,4,7,0>: Cost 2 ins <5,u,7,0>, lane 1 + 2113470467U, // <u,4,7,1>: Cost 2 ins <5,4,7,u>, lane 3 + 2113470467U, // <u,4,7,2>: Cost 2 ins <5,4,7,u>, lane 3 + 2116083713U, // <u,4,7,3>: Cost 2 ins <5,u,7,3>, lane 1 + 1906904272U, // <u,4,7,4>: Cost 2 vzipr RHS, <4,4,4,4> + 1906902734U, // <u,4,7,5>: Cost 2 vzipr RHS, <2,3,4,5> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2116116481U, // <u,4,7,7>: Cost 2 ins <5,u,7,7>, lane 1 + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482064564U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, <0,u,4,u> + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 1012113409U, // <u,4,u,3>: Cost 1 ins 
LHS, lane 1 + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 791457078U, // <u,4,u,5>: Cost 1 vzipl LHS, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 1036328961U, // <u,4,u,7>: Cost 1 ins RHS, lane 1 + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2085683201U, // <u,5,0,0>: Cost 2 ins <0,u,0,0>, lane 1 + 1034493957U, // <u,5,0,1>: Cost 1 ins RHS, lane 5 + 1727914086U, // <u,5,0,2>: Cost 2 vuzpl <u,3,5,7>, LHS + 2085707777U, // <u,5,0,3>: Cost 2 ins <0,u,0,3>, lane 1 + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 1678778497U, // <u,5,0,5>: Cost 2 vuzpl <0,1,5,3>, <0,1,5,3> + 2108219394U, // <u,5,0,6>: Cost 2 ins <4,5,u,6>, lane 2 + 1034485762U, // <u,5,0,7>: Cost 1 ins RHS, lane 2 + 1034493957U, // <u,5,0,u>: Cost 1 ins RHS, lane 5 + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 1860554448U, // <u,5,1,1>: Cost 2 vzipl LHS, <5,1,7,3> + 2103689217U, // <u,5,1,2>: Cost 2 ins <3,u,1,2>, lane 1 + 1750253670U, // <u,5,1,3>: Cost 2 vuzpr <0,u,1,5>, LHS + 1505971738U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, <4,u,5,1> + 1860554756U, // <u,5,1,5>: Cost 2 vzipl LHS, <5,5,5,5> + 1860554850U, // <u,5,1,6>: Cost 2 vzipl LHS, <5,6,7,0> + 1034485762U, // <u,5,1,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <u,5,1,u>: Cost 1 ins RHS, lane 2 + 2085830657U, // <u,5,2,0>: Cost 2 ins <0,u,2,0>, lane 1 + 1994772608U, // <u,5,2,1>: Cost 2 vtrnl LHS, <5,7,1,3> + 2085847041U, // <u,5,2,2>: Cost 2 ins <0,u,2,2>, lane 1 + 1012113409U, // <u,5,2,3>: Cost 1 ins LHS, lane 1 + 2085863425U, // <u,5,2,4>: Cost 2 ins <0,u,2,4>, lane 1 + 1994772484U, // <u,5,2,5>: Cost 2 vtrnl LHS, <5,5,5,5> + 2085879809U, // <u,5,2,6>: Cost 2 ins <0,u,2,6>, lane 1 + 1034485762U, // <u,5,2,7>: Cost 1 ins RHS, lane 2 + 1012113409U, // <u,5,2,u>: Cost 1 ins LHS, lane 1 + 2091876353U, // <u,5,3,0>: Cost 2 ins <1,u,3,0>, lane 1 + 1176121553U, // <u,5,3,1>: Cost 2 vrev <5,u,1,3> + 2091892737U, // <u,5,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 2091900929U, // <u,5,3,3>: Cost 2 ins 
<1,u,3,3>, lane 1 + 2091909121U, // <u,5,3,4>: Cost 2 ins <1,u,3,4>, lane 1 + 1928105178U, // <u,5,3,5>: Cost 2 vzipr LHS, <4,4,5,5> + 1880328706U, // <u,5,3,6>: Cost 2 vzipr LHS, <3,4,5,6> + 945007926U, // <u,5,3,7>: Cost 1 vtrnr LHS, RHS + 945007927U, // <u,5,3,u>: Cost 1 vtrnr LHS, RHS + 2108170242U, // <u,5,4,0>: Cost 2 ins <4,5,u,0>, lane 2 + 2108178434U, // <u,5,4,1>: Cost 2 ins <4,5,u,1>, lane 2 + 2108186626U, // <u,5,4,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2086002689U, // <u,5,4,3>: Cost 2 ins <0,u,4,3>, lane 1 + 1845022662U, // <u,5,4,4>: Cost 2 vzipl <5,4,7,6>, <5,4,7,6> + 1034493957U, // <u,5,4,5>: Cost 1 ins RHS, lane 5 + 1727917366U, // <u,5,4,6>: Cost 2 vuzpl <u,3,5,7>, RHS + 1034485762U, // <u,5,4,7>: Cost 1 ins RHS, lane 2 + 1034493957U, // <u,5,4,u>: Cost 1 ins RHS, lane 5 + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 1863536336U, // <u,5,5,1>: Cost 2 vzipl RHS, <5,1,7,3> + 2108186626U, // <u,5,5,2>: Cost 2 ins <4,5,u,2>, lane 2 + 2086076417U, // <u,5,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 1506004510U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, <4,u,5,5> + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 1863536738U, // <u,5,5,6>: Cost 2 vzipl RHS, <5,6,7,0> + 1034485762U, // <u,5,5,7>: Cost 1 ins RHS, lane 2 + 1034485762U, // <u,5,5,u>: Cost 1 ins RHS, lane 2 + 1034346499U, // <u,5,6,0>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,5,6,1>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,5,6,2>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,5,6,3>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,5,6,4>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,5,6,5>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,5,6,6>: Cost 1 ins RHS, lane 3 + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2114134019U, // <u,5,7,2>: Cost 2 ins <5,5,7,u>, lane 3 + 2133999620U, // <u,5,7,3>: Cost 2 ins <u,u,7,3>, lane 4 + 1488104758U, 
// <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 1906903554U, // <u,5,7,6>: Cost 2 vzipr RHS, <3,4,5,6> + 969223478U, // <u,5,7,7>: Cost 1 vtrnr RHS, RHS + 969223479U, // <u,5,7,u>: Cost 1 vtrnr RHS, RHS + 1034346499U, // <u,5,u,0>: Cost 1 ins RHS, lane 3 + 1034493957U, // <u,5,u,1>: Cost 1 ins RHS, lane 5 + 1034346499U, // <u,5,u,2>: Cost 1 ins RHS, lane 3 + 1012113409U, // <u,5,u,3>: Cost 1 ins LHS, lane 1 + 1034346499U, // <u,5,u,4>: Cost 1 ins RHS, lane 3 + 1034493957U, // <u,5,u,5>: Cost 1 ins RHS, lane 5 + 1034346499U, // <u,5,u,6>: Cost 1 ins RHS, lane 3 + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 1729314816U, // <u,6,0,0>: Cost 2 vuzpl RHS, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS + 655573094U, // <u,6,0,2>: Cost 1 vuzpl RHS, LHS + 2108309507U, // <u,6,0,3>: Cost 2 ins <4,6,0,u>, lane 3 + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2108309507U, // <u,6,0,5>: Cost 2 ins <4,6,0,u>, lane 3 + 2108882946U, // <u,6,0,6>: Cost 2 ins <4,6,u,6>, lane 2 + 1886940470U, // <u,6,0,7>: Cost 2 vzipr <1,2,u,0>, RHS + 655573148U, // <u,6,0,u>: Cost 1 vuzpl RHS, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 1729315636U, // <u,6,1,1>: Cost 2 vuzpl RHS, <1,1,1,1> + 1860555258U, // <u,6,1,2>: Cost 2 vzipl LHS, <6,2,7,3> + 1750335590U, // <u,6,1,3>: Cost 2 vuzpr <0,u,2,6>, LHS + 2114838530U, // <u,6,1,4>: Cost 2 ins <5,6,u,4>, lane 2 + 1729315840U, // <u,6,1,5>: Cost 2 vuzpl RHS, <1,3,5,7> + 1860555576U, // <u,6,1,6>: Cost 2 vzipl LHS, <6,6,6,6> + 1884958006U, // <u,6,1,7>: Cost 2 vzipr <0,u,u,1>, RHS + 1750335595U, // <u,6,1,u>: Cost 2 vuzpr <0,u,2,6>, LHS + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2085838849U, // <u,6,2,1>: Cost 2 ins <0,u,2,1>, lane 1 + 1729316456U, // <u,6,2,2>: Cost 2 vuzpl RHS, <2,2,2,2> + 1012113409U, // <u,6,2,3>: Cost 1 ins LHS, lane 1 + 1506053668U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, 
<4,u,6,2> + 2085871617U, // <u,6,2,5>: Cost 2 ins <0,u,2,5>, lane 1 + 1994773304U, // <u,6,2,6>: Cost 2 vtrnl LHS, <6,6,6,6> + 1880984886U, // <u,6,2,7>: Cost 2 vzipr <0,2,u,2>, RHS + 1012113409U, // <u,6,2,u>: Cost 1 ins LHS, lane 1 + 2066526306U, // <u,6,3,0>: Cost 2 vtrnr LHS, <5,6,7,0> + 1729317014U, // <u,6,3,1>: Cost 2 vuzpl RHS, <3,0,1,2> + 1928104860U, // <u,6,3,2>: Cost 2 vzipr LHS, <4,0,6,2> + 1729317276U, // <u,6,3,3>: Cost 2 vuzpl RHS, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 1729317378U, // <u,6,3,5>: Cost 2 vuzpl RHS, <3,4,5,6> + 1928105188U, // <u,6,3,6>: Cost 2 vzipr LHS, <4,4,6,6> + 806587702U, // <u,6,3,7>: Cost 1 vzipr LHS, RHS + 806587703U, // <u,6,3,u>: Cost 1 vzipr LHS, RHS + 1729318220U, // <u,6,4,0>: Cost 2 vuzpl RHS, <4,6,0,2> + 2108604419U, // <u,6,4,1>: Cost 2 ins <4,6,4,u>, lane 3 + 2108850178U, // <u,6,4,2>: Cost 2 ins <4,6,u,2>, lane 2 + 2108604419U, // <u,6,4,3>: Cost 2 ins <4,6,4,u>, lane 3 + 1729318096U, // <u,6,4,4>: Cost 2 vuzpl RHS, <4,4,4,4> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 655576374U, // <u,6,4,6>: Cost 1 vuzpl RHS, RHS + 1886973238U, // <u,6,4,7>: Cost 2 vzipr <1,2,u,4>, RHS + 655576392U, // <u,6,4,u>: Cost 1 vuzpl RHS, RHS + 2114805762U, // <u,6,5,0>: Cost 2 ins <5,6,u,0>, lane 2 + 1729319040U, // <u,6,5,1>: Cost 2 vuzpl RHS, <5,7,1,3> + 1863537146U, // <u,6,5,2>: Cost 2 vzipl RHS, <6,2,7,3> + 2086076417U, // <u,6,5,3>: Cost 2 ins <0,u,5,3>, lane 1 + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 1729318916U, // <u,6,5,5>: Cost 2 vuzpl RHS, <5,5,5,5> + 1863537464U, // <u,6,5,6>: Cost 2 vzipl RHS, <6,6,6,6> + 1750338870U, // <u,6,5,7>: Cost 2 vuzpr <0,u,2,6>, RHS + 1750338871U, // <u,6,5,u>: Cost 2 vuzpr <0,u,2,6>, RHS + 1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2110021633U, // <u,6,6,1>: Cost 2 ins <4,u,6,1>, lane 1 + 2110029825U, // <u,6,6,2>: Cost 2 ins <4,u,6,2>, lane 1 + 2086150145U, // <u,6,6,3>: Cost 2 ins <0,u,6,3>, lane 1 + 
1506086440U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, <4,u,6,6> + 2110054401U, // <u,6,6,5>: Cost 2 ins <4,u,6,5>, lane 1 + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1036328961U, // <u,6,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <u,6,6,u>: Cost 1 ins RHS, lane 1 + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1906903964U, // <u,6,7,2>: Cost 2 vzipr RHS, <4,0,6,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1906904292U, // <u,6,7,6>: Cost 2 vzipr RHS, <4,4,6,6> + 833162550U, // <u,6,7,7>: Cost 1 vzipr RHS, RHS + 833162551U, // <u,6,7,u>: Cost 1 vzipr RHS, RHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 655578926U, // <u,6,u,2>: Cost 1 vuzpl RHS, LHS + 1012113409U, // <u,6,u,3>: Cost 1 ins LHS, lane 1 + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 655579290U, // <u,6,u,6>: Cost 1 vuzpl RHS, RHS + 806628662U, // <u,6,u,7>: Cost 1 vzipr LHS, RHS + 806628663U, // <u,6,u,u>: Cost 1 vzipr LHS, RHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2115493890U, // <u,7,0,3>: Cost 2 ins <5,7,u,3>, lane 2 + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2120916995U, // <u,7,0,6>: Cost 2 ins <6,7,0,u>, lane 3 + 2115526658U, // <u,7,0,7>: Cost 2 ins <5,7,u,7>, lane 2 + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 700784742U, // <u,7,1,3>: Cost 1 vuzpr RHS, LHS + 1860556134U, // <u,7,1,4>: Cost 2 vzipl LHS, 
<7,4,5,6> + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2115018755U, // <u,7,1,6>: Cost 2 ins <5,7,1,u>, lane 3 + 1860556396U, // <u,7,1,7>: Cost 2 vzipl LHS, <7,7,7,7> + 700784747U, // <u,7,1,u>: Cost 1 vuzpr RHS, LHS + 1774527382U, // <u,7,2,0>: Cost 2 vuzpr RHS, <1,2,3,0> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1012113409U, // <u,7,2,3>: Cost 1 ins LHS, lane 1 + 1774527386U, // <u,7,2,4>: Cost 2 vuzpr RHS, <1,2,3,4> + 1994773862U, // <u,7,2,5>: Cost 2 vtrnl LHS, <7,4,5,6> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 1994774124U, // <u,7,2,7>: Cost 2 vtrnl LHS, <7,7,7,7> + 1012113409U, // <u,7,2,u>: Cost 1 ins LHS, lane 1 + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 1774528166U, // <u,7,3,1>: Cost 2 vuzpr RHS, <2,3,0,1> + 2091892737U, // <u,7,3,2>: Cost 2 ins <1,u,3,2>, lane 1 + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1774528206U, // <u,7,3,5>: Cost 2 vuzpr RHS, <2,3,4,5> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 1774527488U, // <u,7,3,7>: Cost 2 vuzpr RHS, <1,3,5,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2121449474U, // <u,7,4,1>: Cost 2 ins <6,7,u,1>, lane 2 + 2121211907U, // <u,7,4,2>: Cost 2 ins <6,7,4,u>, lane 3 + 2115493890U, // <u,7,4,3>: Cost 2 ins <5,7,u,3>, lane 2 + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360076U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,0,2> + 2115526658U, // <u,7,4,7>: Cost 2 ins <5,7,u,7>, lane 2 + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 1863537658U, // <u,7,5,0>: Cost 2 vzipl RHS, <7,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2115313667U, // <u,7,5,2>: Cost 2 ins <5,7,5,u>, lane 3 + 2115493890U, // <u,7,5,3>: Cost 2 ins <5,7,u,3>, lane 2 + 
1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 700788022U, // <u,7,5,7>: Cost 1 vuzpr RHS, RHS + 700788023U, // <u,7,5,u>: Cost 1 vuzpr RHS, RHS + 1774530658U, // <u,7,6,0>: Cost 2 vuzpr RHS, <5,6,7,0> + 1997755386U, // <u,7,6,1>: Cost 2 vtrnl RHS, <7,0,1,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2115493890U, // <u,7,6,3>: Cost 2 ins <5,7,u,3>, lane 2 + 1774530662U, // <u,7,6,4>: Cost 2 vuzpr RHS, <5,6,7,4> + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1036328961U, // <u,7,6,7>: Cost 1 ins RHS, lane 1 + 1036328961U, // <u,7,6,u>: Cost 1 ins RHS, lane 1 + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 1774531406U, // <u,7,7,1>: Cost 2 vuzpr RHS, <6,7,0,1> + 2127405059U, // <u,7,7,2>: Cost 2 ins <7,7,7,u>, lane 3 + 1906904784U, // <u,7,7,3>: Cost 2 vzipr RHS, <5,1,7,3> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1774531446U, // <u,7,7,5>: Cost 2 vuzpr RHS, <6,7,4,5> + 1906905030U, // <u,7,7,6>: Cost 2 vzipr RHS, <5,4,7,6> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 700785309U, // <u,7,u,3>: Cost 1 vuzpr RHS, LHS + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 700788265U, // <u,7,u,7>: Cost 1 vuzpr RHS, RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 653033574U, // <u,u,0,2>: Cost 1 vuzpl LHS, LHS + 1007910914U, // <u,u,0,3>: Cost 1 ins LHS, lane 2 + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 
1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1995282586U, // <u,u,0,6>: Cost 2 vtrnl <u,2,0,2>, RHS + 1034485762U, // <u,u,0,7>: Cost 1 ins RHS, lane 2 + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 786814766U, // <u,u,1,1>: Cost 1 vzipl LHS, LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS + 676610150U, // <u,u,1,3>: Cost 1 vuzpr LHS, LHS + 1482304822U, // <u,u,1,4>: Cost 2 vext1 <0,u,u,1>, RHS + 786815130U, // <u,u,1,5>: Cost 1 vzipl LHS, RHS + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1034485762U, // <u,u,1,7>: Cost 1 ins RHS, lane 2 + 537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS + 1007509507U, // <u,u,2,0>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,u,2,1>: Cost 1 ins LHS, lane 3 + 921032494U, // <u,u,2,2>: Cost 1 vtrnl LHS, LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1007509507U, // <u,u,2,4>: Cost 1 ins LHS, lane 3 + 1007509507U, // <u,u,2,5>: Cost 1 ins LHS, lane 3 + 921032858U, // <u,u,2,6>: Cost 1 vtrnl LHS, RHS + 1007509507U, // <u,u,2,7>: Cost 1 ins LHS, lane 3 + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1880327918U, // <u,u,3,1>: Cost 2 vzipr LHS, <2,3,u,1> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 806584476U, // <u,u,3,3>: Cost 1 vzipr LHS, LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1880327922U, // <u,u,3,5>: Cost 2 vzipr LHS, <2,3,u,5> + 1880326384U, // <u,u,3,6>: Cost 2 vzipr LHS, <0,2,u,6> + 806587720U, // <u,u,3,7>: Cost 1 vzipr LHS, RHS + 806584481U, // <u,u,3,u>: Cost 1 vzipr LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1007910914U, // <u,u,4,3>: Cost 1 ins LHS, lane 2 + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 653036854U, // <u,u,4,6>: Cost 1 vuzpl LHS, RHS + 1034485762U, // 
<u,u,4,7>: Cost 1 ins RHS, lane 2 + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1482334933U, // <u,u,5,0>: Cost 2 vext1 <0,u,u,5>, <0,u,u,5> + 789796654U, // <u,u,5,1>: Cost 1 vzipl RHS, LHS + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1007910914U, // <u,u,5,3>: Cost 1 ins LHS, lane 2 + 1482337590U, // <u,u,5,4>: Cost 2 vext1 <0,u,u,5>, RHS + 789797018U, // <u,u,5,5>: Cost 1 vzipl RHS, RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 676613430U, // <u,u,5,7>: Cost 1 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1034346499U, // <u,u,6,0>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,u,6,1>: Cost 1 ins RHS, lane 3 + 924014382U, // <u,u,6,2>: Cost 1 vtrnl RHS, LHS + 1007910914U, // <u,u,6,3>: Cost 1 ins LHS, lane 2 + 1034346499U, // <u,u,6,4>: Cost 1 ins RHS, lane 3 + 1034346499U, // <u,u,6,5>: Cost 1 ins RHS, lane 3 + 924014746U, // <u,u,6,6>: Cost 1 vtrnl RHS, RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 833159324U, // <u,u,7,3>: Cost 1 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1906901393U, // <u,u,7,5>: Cost 2 vzipr RHS, <0,4,u,5> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 833162568U, // <u,u,7,7>: Cost 1 vzipr RHS, RHS + 833159329U, // <u,u,7,u>: Cost 1 vzipr RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0}; + +static unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) { + 
assert(M.size() == 4 && "Expected a 4 entry perfect shuffle"); + + // Special case zero-cost nop copies, from either LHS or RHS. + if (llvm::all_of(llvm::enumerate(M), [](auto &E) { + return E.value() < 0 || E.value() == (int)E.index(); + })) + return 0; + if (llvm::all_of(llvm::enumerate(M), [](auto &E) { + return E.value() < 0 || E.value() == (int)E.index() + 4; + })) + return 0; + + // Get the four mask elementd from the 2 inputs. Perfect shuffles encode undef + // elements with value 8. + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + assert(M[i] < 8 && "Expected a maximum entry of 8 for shuffle mask"); + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + // And extract the cost from the upper bits. The cost is encoded as Cost-1. + return (PFEntry >> 30) + 1; +} #endif diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index d1b901e58d27..f7c06b9fb71b 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -32,6 +33,8 @@ using namespace llvm; +#define GET_CC_REGISTER_LISTS +#include "AArch64GenCallingConv.inc" #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" @@ -63,14 +66,6 @@ bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg, return true; } -bool AArch64RegisterInfo::hasSVEArgsOrReturn(const MachineFunction *MF) { - const Function &F = MF->getFunction(); - return 
isa<ScalableVectorType>(F.getReturnType()) || - any_of(F.args(), [](const Argument &Arg) { - return isa<ScalableVectorType>(Arg.getType()); - }); -} - const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); @@ -108,7 +103,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // This is for OSes other than Windows; Windows is a separate case further // above. return CSR_AArch64_AAPCS_X18_SaveList; - if (hasSVEArgsOrReturn(MF)) + if (MF->getInfo<AArch64FunctionInfo>()->isSVECC()) return CSR_AArch64_SVE_AAPCS_SaveList; return CSR_AArch64_AAPCS_SaveList; } @@ -335,6 +330,13 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) markSuperRegs(Reserved, AArch64::W16); + // SME tiles are not allocatable. + if (MF.getSubtarget<AArch64Subtarget>().hasSME()) { + for (MCSubRegIterator SubReg(AArch64::ZA, this, /*self=*/true); + SubReg.isValid(); ++SubReg) + Reserved.set(*SubReg); + } + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } @@ -417,6 +419,68 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } +bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const { + CallingConv::ID CC = MF.getFunction().getCallingConv(); + const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); + bool IsVarArg = STI.isCallingConvWin64(MF.getFunction().getCallingConv()); + + auto HasReg = [](ArrayRef<MCRegister> RegList, MCRegister Reg) { + return llvm::any_of(RegList, + [Reg](const MCRegister R) { return R == Reg; }); + }; + + switch (CC) { + default: + report_fatal_error("Unsupported calling convention."); + case CallingConv::WebKit_JS: + return HasReg(CC_AArch64_WebKit_JS_ArgRegs, Reg); + case CallingConv::GHC: + return HasReg(CC_AArch64_GHC_ArgRegs, Reg); + case CallingConv::C: + case 
CallingConv::Fast: + case CallingConv::PreserveMost: + case CallingConv::CXX_FAST_TLS: + case CallingConv::Swift: + case CallingConv::SwiftTail: + case CallingConv::Tail: + if (STI.isTargetWindows() && IsVarArg) + return HasReg(CC_AArch64_Win64_VarArg_ArgRegs, Reg); + if (!STI.isTargetDarwin()) { + switch (CC) { + default: + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg); + case CallingConv::Swift: + case CallingConv::SwiftTail: + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg) || + HasReg(CC_AArch64_AAPCS_Swift_ArgRegs, Reg); + } + } + if (!IsVarArg) { + switch (CC) { + default: + return HasReg(CC_AArch64_DarwinPCS_ArgRegs, Reg); + case CallingConv::Swift: + case CallingConv::SwiftTail: + return HasReg(CC_AArch64_DarwinPCS_ArgRegs, Reg) || + HasReg(CC_AArch64_DarwinPCS_Swift_ArgRegs, Reg); + } + } + if (STI.isTargetILP32()) + return HasReg(CC_AArch64_DarwinPCS_ILP32_VarArg_ArgRegs, Reg); + return HasReg(CC_AArch64_DarwinPCS_VarArg_ArgRegs, Reg); + case CallingConv::Win64: + if (IsVarArg) + HasReg(CC_AArch64_Win64_VarArg_ArgRegs, Reg); + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg); + case CallingConv::CFGuard_Check: + return HasReg(CC_AArch64_Win64_CFGuard_Check_ArgRegs, Reg); + case CallingConv::AArch64_VectorCall: + case CallingConv::AArch64_SVE_VectorCall: + return HasReg(CC_AArch64_AAPCS_ArgRegs, Reg); + } +} + Register AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -588,23 +652,31 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, // Create a scratch register for the frame index elimination in an instruction. // This function has special handling of stack tagging loop pseudos, in which -// case it can also change the instruction opcode (but not the operands). +// case it can also change the instruction opcode. 
static Register -createScratchRegisterForInstruction(MachineInstr &MI, +createScratchRegisterForInstruction(MachineInstr &MI, unsigned FIOperandNum, const AArch64InstrInfo *TII) { // ST*Gloop have a reserved scratch register in operand 1. Use it, and also // replace the instruction with the writeback variant because it will now // satisfy the operand constraints for it. - if (MI.getOpcode() == AArch64::STGloop) { - MI.setDesc(TII->get(AArch64::STGloop_wback)); - return MI.getOperand(1).getReg(); - } else if (MI.getOpcode() == AArch64::STZGloop) { - MI.setDesc(TII->get(AArch64::STZGloop_wback)); - return MI.getOperand(1).getReg(); + Register ScratchReg; + if (MI.getOpcode() == AArch64::STGloop || + MI.getOpcode() == AArch64::STZGloop) { + assert(FIOperandNum == 3 && + "Wrong frame index operand for STGloop/STZGloop"); + unsigned Op = MI.getOpcode() == AArch64::STGloop ? AArch64::STGloop_wback + : AArch64::STZGloop_wback; + ScratchReg = MI.getOperand(1).getReg(); + MI.getOperand(3).ChangeToRegister(ScratchReg, false, false, true); + MI.setDesc(TII->get(Op)); + MI.tieOperands(1, 3); } else { - return MI.getMF()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); + ScratchReg = + MI.getMF()->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + MI.getOperand(FIOperandNum) + .ChangeToRegister(ScratchReg, false, false, true); } + return ScratchReg; } void AArch64RegisterInfo::getOffsetOpcodes( @@ -721,9 +793,9 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. 
- Register ScratchReg = createScratchRegisterForInstruction(MI, TII); + Register ScratchReg = + createScratchRegisterForInstruction(MI, FIOperandNum, TII); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); - MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 0c871ac089a7..12dd70fa4aa8 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -42,8 +42,6 @@ public: void UpdateCustomCallPreservedMask(MachineFunction &MF, const uint32_t **Mask) const; - static bool hasSVEArgsOrReturn(const MachineFunction *MF); - /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const; @@ -120,6 +118,9 @@ public: bool hasBasePointer(const MachineFunction &MF) const; unsigned getBaseRegister() const; + bool isArgumentRegister(const MachineFunction &MF, + MCRegister Reg) const override; + // Debug information queries. Register getFrameRegister(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 70daf5abf81d..7a2b165570cb 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -871,7 +871,7 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size, // SVE predicate register classes. 
class PPRClass<int lastreg> : RegisterClass< "AArch64", - [ nxv16i1, nxv8i1, nxv4i1, nxv2i1 ], 16, + [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, (sequence "P%u", 0, lastreg)> { let Size = 16; } @@ -1212,26 +1212,28 @@ let SubRegIndices = [zasubb] in { // SME Register Classes -// Accumulator array -def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> { - let Size = 2048; -} +let isAllocatable = 0 in { + // Accumulator array + def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> { + let Size = 2048; + } -// Accumulator array as single tiles -def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> { - let Size = 2048; -} -def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> { - let Size = 1024; -} -def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> { - let Size = 512; -} -def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> { - let Size = 256; -} -def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> { - let Size = 128; + // Accumulator array as single tiles + def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> { + let Size = 2048; + } + def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> { + let Size = 1024; + } + def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> { + let Size = 512; + } + def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> { + let Size = 256; + } + def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> { + let Size = 128; + } } // SME Register Operands @@ -1385,3 +1387,12 @@ def svcr_op : Operand<i32> { return AArch64SVCR::lookupSVCRByEncoding(MCOp.getImm()) != nullptr; }]; } + +//===----------------------------------------------------------------------===// +// Register categories. 
+// + +def GeneralPurposeRegisters : RegisterCategory<[GPR64, GPR32]>; + +def FIXED_REGS : RegisterClass<"AArch64", [i64], 64, (add FP, SP, VG, FFR)>; +def FixedRegisters : RegisterCategory<[CCR, FIXED_REGS]>; diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index c4965e7146ff..364ce687fd55 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -360,8 +360,8 @@ AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB, assert(ImpSPOpIdx != -1); int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx); int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx); - BL->RemoveOperand(FirstOpIdxToRemove); - BL->RemoveOperand(SecondOpIdxToRemove); + BL->removeOperand(FirstOpIdxToRemove); + BL->removeOperand(SecondOpIdxToRemove); // Now copy over the implicit operands from the original BLR BL->copyImplicitOps(MF, BLR); MF.moveCallSiteInfo(&BLR, BL); diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index aacace64e998..e595d20c8d4e 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -14,9 +14,18 @@ // Add vector elements horizontally or vertically to ZA tile. 
//===----------------------------------------------------------------------===// +def SDT_AArch64RDSVL : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; +def AArch64rdsvl : SDNode<"AArch64ISD::RDSVL", SDT_AArch64RDSVL>; + let Predicates = [HasSME] in { +def RDSVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdsvl", /*streaming_sve=*/0b1>; +def ADDSPL_XXI : sve_int_arith_vl<0b1, "addspl", /*streaming_sve=*/0b1>; +def ADDSVL_XXI : sve_int_arith_vl<0b0, "addsvl", /*streaming_sve=*/0b1>; + def ADDHA_MPPZ_S : sme_add_vector_to_tile_u32<0b0, "addha">; def ADDVA_MPPZ_S : sme_add_vector_to_tile_u32<0b1, "addva">; + +def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>; } let Predicates = [HasSMEI64] in { @@ -29,41 +38,41 @@ let Predicates = [HasSME] in { // Outer products //===----------------------------------------------------------------------===// -defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa">; -defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops">; +defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa", int_aarch64_sme_mopa_wide>; +defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops", int_aarch64_sme_mops_wide>; -def FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa">; -def FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops">; +defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa", int_aarch64_sme_mopa>; +defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops", int_aarch64_sme_mops>; } let Predicates = [HasSMEF64] in { -def FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa">; -def FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops">; +defm FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa", int_aarch64_sme_mopa>; +defm FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops", int_aarch64_sme_mops>; } let Predicates = [HasSME] in { -defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa">; -defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops">; - -def SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa">; -def 
SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops">; -def UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa">; -def UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops">; -def SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa">; -def SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops">; -def USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa">; -def USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops">; +defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa", int_aarch64_sme_mopa_wide>; +defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops", int_aarch64_sme_mops_wide>; + +defm SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa", int_aarch64_sme_smopa_wide>; +defm SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops", int_aarch64_sme_smops_wide>; +defm UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa", int_aarch64_sme_umopa_wide>; +defm UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops", int_aarch64_sme_umops_wide>; +defm SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa", int_aarch64_sme_sumopa_wide>; +defm SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops", int_aarch64_sme_sumops_wide>; +defm USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa", int_aarch64_sme_usmopa_wide>; +defm USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops", int_aarch64_sme_usmops_wide>; } let Predicates = [HasSMEI64] in { -def SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa">; -def SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops">; -def UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa">; -def UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops">; -def SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa">; -def SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops">; -def USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa">; -def USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops">; +defm SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa", 
int_aarch64_sme_smopa_wide>; +defm SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops", int_aarch64_sme_smops_wide>; +defm UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa", int_aarch64_sme_umopa_wide>; +defm UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops", int_aarch64_sme_umops_wide>; +defm SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa", int_aarch64_sme_sumopa_wide>; +defm SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops", int_aarch64_sme_sumops_wide>; +defm USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa", int_aarch64_sme_usmopa_wide>; +defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme_usmops_wide>; } let Predicates = [HasSME] in { @@ -129,15 +138,21 @@ def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>; def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; +// Read and write TPIDR2_EL0 +def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), + (MSR 0xde85, GPR64:$val)>; +def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), + (MRS 0xde85)>; + //===----------------------------------------------------------------------===// // SVE2 instructions //===----------------------------------------------------------------------===// -def REVD_ZPmZ : sve2_int_perm_revd<"revd">; +defm REVD_ZPmZ : sve2_int_perm_revd<"revd", AArch64revd_mt>; -defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0>; -defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1>; +defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, int_aarch64_sve_sclamp>; +defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, int_aarch64_sve_uclamp>; -defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel">; +defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; } // End let Predicates = [HasSME] diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 1d162610de9c..68ff1b78e84b 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ 
b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -165,8 +165,8 @@ def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; def SDT_AArch64Arith : SDTypeProfile<1, 3, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, - SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3> + SDTCisVec<0>, SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, + SDTCisSameAs<2,3>, SDTCisSameNumEltsAs<0,1> ]>; def SDT_AArch64FMA : SDTypeProfile<1, 4, [ @@ -175,7 +175,6 @@ def SDT_AArch64FMA : SDTypeProfile<1, 4, [ ]>; // Predicated operations with the result of inactive lanes being unspecified. -def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; @@ -194,7 +193,6 @@ def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>; -def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>; def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>; def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; @@ -235,6 +233,7 @@ def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revd_mt : SDNode<"AArch64ISD::REVD_MERGE_PASSTHRU", SDT_AArch64Arith>; // These are like the above but we 
don't yet have need for ISD nodes. They allow // a single pattern to match intrinsic and ISD operand layouts. @@ -242,6 +241,26 @@ def AArch64cls_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_ def AArch64cnot_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cnot node:$pt, node:$pg, node:$op)]>; def AArch64not_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_not node:$pt, node:$pg, node:$op)]>; +def AArch64fmul_m1 : EitherVSelectOrPassthruPatFrags<int_aarch64_sve_fmul, AArch64fmul_p>; +def AArch64fadd_m1 : EitherVSelectOrPassthruPatFrags<int_aarch64_sve_fadd, AArch64fadd_p>; +def AArch64fsub_m1 : EitherVSelectOrPassthruPatFrags<int_aarch64_sve_fsub, AArch64fsub_p>; + +def AArch64saba : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_saba node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64sabd_p (SVEAllActive), node:$op2, node:$op3))]>; + +def AArch64uaba : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_uaba node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64uabd_p (SVEAllActive), node:$op2, node:$op3))]>; + +def AArch64usra : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_usra node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64lsr_p (SVEAllActive), node:$op2, (SVEShiftSplatImmR (i32 node:$op3))))]>; + +def AArch64ssra : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_ssra node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64asr_p (SVEAllActive), node:$op2, (SVEShiftSplatImmR (i32 node:$op3))))]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1> @@ -282,6 +301,14 @@ def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), def AArch64fabd_p : PatFrag<(ops node:$pg, node:$op1, node:$op2), (AArch64fabs_mt node:$pg, (AArch64fsub_p node:$pg, node:$op1, node:$op2), undef)>; +// FMAs with a negated multiplication 
operand can be commuted. +def AArch64fmls_p : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), + [(AArch64fma_p node:$pred, (AArch64fneg_mt node:$pred, node:$op1, (undef)), node:$op2, node:$op3), + (AArch64fma_p node:$pred, node:$op2, (AArch64fneg_mt node:$pred, node:$op1, (undef)), node:$op3)]>; + +def AArch64fsubr_p : PatFrag<(ops node:$pg, node:$op1, node:$op2), + (AArch64fsub_p node:$pg, node:$op2, node:$op1)>; + def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt), (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{ return N->getFlags().hasNoSignedZeros(); @@ -295,11 +322,14 @@ def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [ def AArch64bic_node : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>; def AArch64bic : PatFrags<(ops node:$op1, node:$op2), - [(and node:$op1, (xor node:$op2, (AArch64dup (i32 -1)))), - (and node:$op1, (xor node:$op2, (AArch64dup (i64 -1)))), + [(and node:$op1, (xor node:$op2, (splat_vector (i32 -1)))), + (and node:$op1, (xor node:$op2, (splat_vector (i64 -1)))), (and node:$op1, (xor node:$op2, (SVEAllActive))), (AArch64bic_node node:$op1, node:$op2)]>; +def AArch64subr : PatFrag<(ops node:$op1, node:$op2), + (sub node:$op2, node:$op1)>; + let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; @@ -308,7 +338,7 @@ let Predicates = [HasSVE] in { def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; @@ -325,25 +355,27 @@ let Predicates = [HasSVEorStreamingSVE] in { defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; defm SUBR_ZPmZ : 
sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; - defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>; - defm SUB_ZPZZ : sve_int_bin_pred_bhsd<AArch64sub_p>; -} // End HasSVEorStreamingSVE + defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", "ORR_ZPZZ", int_aarch64_sve_orr, DestructiveBinaryComm>; + defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", "EOR_ZPZZ", int_aarch64_sve_eor, DestructiveBinaryComm>; + defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", "AND_ZPZZ", int_aarch64_sve_and, DestructiveBinaryComm>; + defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", "BIC_ZPZZ", int_aarch64_sve_bic, DestructiveBinary>; +} // End HasSVEorSME -let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>; defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>; defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>; -} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorStreamingSVE] in { - defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>; - defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>; - defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; - defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; + defm ORR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_orr>; + defm EOR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_eor>; + defm AND_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_and>; + defm BIC_ZPZZ : sve_int_bin_pred_zeroing_bhsd<null_frag>; +} // End HasSVEorSME, UseExperimentalZeroingPseudos +let Predicates = [HasSVEorSME] in { defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; - defm SUBR_ZI : 
sve_int_arith_imm0_subr<0b011, "subr", sub>; + defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr", AArch64subr>; defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>; defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>; defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>; @@ -440,11 +472,11 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", "FMINNM_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fminnm>; defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", "FMAX_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmax>; defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", "FMIN_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>; - + defm FADD_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fadd_p>; defm FSUB_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fsub_p>; defm FMUL_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_two, fpimm_half, fpimm_two, AArch64fmul_p>; - defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one>; + defm FSUBR_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fsubr_p>; defm FMAXNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmaxnm_p>; defm FMINNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fminnm_p>; defm FMAX_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmax_p>; @@ -461,9 +493,9 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FMIN_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>; } - defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>; - defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">; - defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, 
"fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>; + defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>; + defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">; + defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>; defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>; defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>; defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>; @@ -484,9 +516,9 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmin_p>; defm FABD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fabd_p>; defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME -let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>; defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>; defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>; @@ -499,28 +531,28 @@ let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>; defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>; defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>; -} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos +} // End HasSVEorSME, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>; 
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>; defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", AArch64frecps>; defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", AArch64frsqrts>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>; defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>; @@ -545,7 +577,7 @@ let Predicates = [HasSVEorStreamingSVE] in { (!cast<Instruction>("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zd = Za + -Zn * Zm - def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)), + def : Pat<(Ty (AArch64fmls_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)), (!cast<Instruction>("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zd = -Za + Zn * Zm @@ -576,26 +608,26 @@ let Predicates = [HasSVEorStreamingSVE] in { defm : fma<nxv4f32, nxv4i1, "S">; defm : fma<nxv2f32, nxv2i1, "S">; defm : fma<nxv2f64, nxv2i1, "D">; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>; defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>; defm FCMLA_ZZZI : 
sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>; defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // SVE floating point reductions. defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>; defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>; defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>; @@ -613,7 +645,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">; // Splat scalar register (unpredicated, GPR or vector + element index) - defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>; + defm DUP_ZR : sve_int_perm_dup_r<"dup", splat_vector>; defm DUP_ZZI : sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) @@ -621,61 +653,67 @@ let Predicates = [HasSVEorStreamingSVE] in { defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>; // Duplicate FP scalar into all vector elements - def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))), + def : Pat<(nxv8f16 (splat_vector (f16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; - def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))), + def : Pat<(nxv4f16 (splat_vector (f16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; - def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))), + def : Pat<(nxv2f16 (splat_vector (f16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; - def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))), + def : Pat<(nxv4f32 (splat_vector (f32 FPR32:$src))), (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; - def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))), + def : Pat<(nxv2f32 (splat_vector (f32 FPR32:$src))), 
(DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; - def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))), + def : Pat<(nxv2f64 (splat_vector (f64 FPR64:$src))), (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))), + def : Pat<(nxv8bf16 (splat_vector (bf16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4bf16 (splat_vector (bf16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv2bf16 (splat_vector (bf16 FPR16:$src))), (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; // Duplicate +0.0 into all vector elements - def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; - def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; - def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; - def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; - def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; - def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv8f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv2f16 (splat_vector (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f32 (splat_vector (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f32 (splat_vector (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f64 (splat_vector (f64 fpimm0))), (DUP_ZI_D 0, 0)>; + def : Pat<(nxv8bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv2bf16 (splat_vector (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; // Duplicate Int immediate into all vector elements - def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + def : Pat<(nxv16i8 (splat_vector (i32 (SVECpyDupImm8Pat 
i32:$a, i32:$b)))), (DUP_ZI_B $a, $b)>; - def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + def : Pat<(nxv8i16 (splat_vector (i32 (SVECpyDupImm16Pat i32:$a, i32:$b)))), (DUP_ZI_H $a, $b)>; - def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + def : Pat<(nxv4i32 (splat_vector (i32 (SVECpyDupImm32Pat i32:$a, i32:$b)))), (DUP_ZI_S $a, $b)>; - def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm64 i32:$a, i32:$b)))), + def : Pat<(nxv2i64 (splat_vector (i64 (SVECpyDupImm64Pat i32:$a, i32:$b)))), (DUP_ZI_D $a, $b)>; // Duplicate immediate FP into all vector elements. - def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))), + def : Pat<(nxv2f32 (splat_vector (f32 fpimm:$val))), (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; - def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))), + def : Pat<(nxv4f32 (splat_vector (f32 fpimm:$val))), (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; - def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))), + def : Pat<(nxv2f64 (splat_vector (f64 fpimm:$val))), (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>; // Duplicate FP immediate into all vector elements let AddedComplexity = 2 in { - def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)), + def : Pat<(nxv8f16 (splat_vector fpimm16:$imm8)), (FDUP_ZI_H fpimm16:$imm8)>; - def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)), + def : Pat<(nxv4f16 (splat_vector fpimm16:$imm8)), (FDUP_ZI_H fpimm16:$imm8)>; - def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)), + def : Pat<(nxv2f16 (splat_vector fpimm16:$imm8)), (FDUP_ZI_H fpimm16:$imm8)>; - def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)), + def : Pat<(nxv4f32 (splat_vector fpimm32:$imm8)), (FDUP_ZI_S fpimm32:$imm8)>; - def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)), + def : Pat<(nxv2f32 (splat_vector fpimm32:$imm8)), (FDUP_ZI_S fpimm32:$imm8)>; - def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)), + def : Pat<(nxv2f64 (splat_vector fpimm64:$imm8)), (FDUP_ZI_D fpimm64:$imm8)>; } @@ -683,13 +721,13 @@ let 
Predicates = [HasSVEorStreamingSVE] in { defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; @@ -710,16 +748,21 @@ let Predicates = [HasSVEorStreamingSVE] in { defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>; defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>; + // Define pattern for `nxv1i1 splat_vector(1)`. + // We do this here instead of in ISelLowering such that PatFrag's can still + // recognize a splat. + def : Pat<(nxv1i1 immAllOnesV), (PUNPKLO_PP (PTRUE_D 31))>; + defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>; defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>; defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>; @@ -831,7 +874,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>; defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>; defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, 
GPR64NoXZRshifted64>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // non-faulting continuous load with reg+immediate @@ -871,7 +914,7 @@ let Predicates = [HasSVE] in { defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // LD(2|3|4) structured loads with reg+immediate defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>; defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>; @@ -899,7 +942,7 @@ let Predicates = [HasSVEorStreamingSVE] in { def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>; def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>; def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. @@ -1013,9 +1056,95 @@ let Predicates = [HasSVE] in { defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + + multiclass sve_masked_gather_x2_scaled<ValueType Ty, SDPatternOperator Load, string Inst> { + // base + vector of scaled offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))), + (!cast<Instruction>(Inst # _SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit scaled offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 
ZPR:$offs), nxv2i32))), + (!cast<Instruction>(Inst # _SXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit scaled offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))), + (!cast<Instruction>(Inst # _UXTW_SCALED) PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_gather_x2_unscaled<ValueType Ty, SDPatternOperator Load, string Inst, Operand ImmTy> { + // vector of pointers + immediate offset (includes zero) + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs))), + (!cast<Instruction>(Inst # _IMM) PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>; + // base + vector of offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs))), + (!cast<Instruction>(Inst) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32))), + (!cast<Instruction>(Inst # _SXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit offsets + def : Pat<(Ty (Load (SVEDup0Undef), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))), + (!cast<Instruction>(Inst # _UXTW) PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_gather_x4<ValueType Ty, SDPatternOperator Load, Instruction Inst> { + def : Pat<(Ty (Load (SVEDup0Undef), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs))), + (Inst PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + defm : sve_masked_gather_x2_scaled<nxv2i64, azext_masked_gather_i16_signed_scaled, "GLD1H_D">; + defm : sve_masked_gather_x2_scaled<nxv2i64, sext_masked_gather_i16_signed_scaled, "GLD1SH_D">; + defm : sve_masked_gather_x2_scaled<nxv2i64, azext_masked_gather_i32_signed_scaled, "GLD1W_D">; + defm : sve_masked_gather_x2_scaled<nxv2i64, sext_masked_gather_i32_signed_scaled, 
"GLD1SW_D">; + defm : sve_masked_gather_x2_scaled<nxv2i64, nonext_masked_gather_signed_scaled, "GLD1D">; + defm : sve_masked_gather_x2_scaled<nxv2f16, nonext_masked_gather_signed_scaled, "GLD1H_D">; + defm : sve_masked_gather_x2_scaled<nxv2f32, nonext_masked_gather_signed_scaled, "GLD1W_D">; + defm : sve_masked_gather_x2_scaled<nxv2f64, nonext_masked_gather_signed_scaled, "GLD1D">; + defm : sve_masked_gather_x2_scaled<nxv2bf16, nonext_masked_gather_signed_scaled, "GLD1H_D">; + + defm : sve_masked_gather_x2_unscaled<nxv2i64, azext_masked_gather_i8_signed_unscaled, "GLD1B_D" , imm0_31>; + defm : sve_masked_gather_x2_unscaled<nxv2i64, sext_masked_gather_i8_signed_unscaled, "GLD1SB_D", imm0_31>; + defm : sve_masked_gather_x2_unscaled<nxv2i64, azext_masked_gather_i16_signed_unscaled, "GLD1H_D", uimm5s2>; + defm : sve_masked_gather_x2_unscaled<nxv2i64, sext_masked_gather_i16_signed_unscaled, "GLD1SH_D", uimm5s2>; + defm : sve_masked_gather_x2_unscaled<nxv2i64, azext_masked_gather_i32_signed_unscaled, "GLD1W_D", uimm5s4>; + defm : sve_masked_gather_x2_unscaled<nxv2i64, sext_masked_gather_i32_signed_unscaled, "GLD1SW_D", uimm5s4>; + defm : sve_masked_gather_x2_unscaled<nxv2i64, nonext_masked_gather_signed_unscaled, "GLD1D", uimm5s8>; + defm : sve_masked_gather_x2_unscaled<nxv2f16, nonext_masked_gather_signed_unscaled, "GLD1H_D", uimm5s2>; + defm : sve_masked_gather_x2_unscaled<nxv2f32, nonext_masked_gather_signed_unscaled, "GLD1W_D", uimm5s4>; + defm : sve_masked_gather_x2_unscaled<nxv2f64, nonext_masked_gather_signed_unscaled, "GLD1D", uimm5s8>; + defm : sve_masked_gather_x2_unscaled<nxv2bf16, nonext_masked_gather_signed_unscaled, "GLD1H_D", uimm5s2>; + + defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_signed_scaled, GLD1H_S_SXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_signed_scaled, GLD1SH_S_SXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_signed_scaled, GLD1W_SXTW_SCALED>; + defm : 
sve_masked_gather_x4<nxv4f16, nonext_masked_gather_signed_scaled, GLD1H_S_SXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_signed_scaled, GLD1W_SXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_signed_scaled, GLD1H_S_SXTW_SCALED>; + + defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i8_signed_unscaled, GLD1B_S_SXTW>; + defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i8_signed_unscaled, GLD1SB_S_SXTW>; + defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_signed_unscaled, GLD1H_S_SXTW>; + defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_signed_unscaled, GLD1SH_S_SXTW>; + defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_signed_unscaled, GLD1W_SXTW>; + defm : sve_masked_gather_x4<nxv4f16, nonext_masked_gather_signed_unscaled, GLD1H_S_SXTW>; + defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_signed_unscaled, GLD1W_SXTW>; + defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_signed_unscaled, GLD1H_S_SXTW>; + + defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_unsigned_scaled, GLD1H_S_UXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_unsigned_scaled, GLD1SH_S_UXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_unsigned_scaled, GLD1W_UXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4f16, nonext_masked_gather_unsigned_scaled, GLD1H_S_UXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_unsigned_scaled, GLD1W_UXTW_SCALED>; + defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_unsigned_scaled, GLD1H_S_UXTW_SCALED>; + + defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i8_unsigned_unscaled, GLD1B_S_UXTW>; + defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i8_unsigned_unscaled, GLD1SB_S_UXTW>; + defm : sve_masked_gather_x4<nxv4i32, azext_masked_gather_i16_unsigned_unscaled, GLD1H_S_UXTW>; + defm : sve_masked_gather_x4<nxv4i32, sext_masked_gather_i16_unsigned_unscaled, 
GLD1SH_S_UXTW>; + defm : sve_masked_gather_x4<nxv4i32, nonext_masked_gather_unsigned_unscaled, GLD1W_UXTW>; + defm : sve_masked_gather_x4<nxv4f16, nonext_masked_gather_unsigned_unscaled, GLD1H_S_UXTW>; + defm : sve_masked_gather_x4<nxv4f32, nonext_masked_gather_unsigned_unscaled, GLD1W_UXTW>; + defm : sve_masked_gather_x4<nxv4bf16, nonext_masked_gather_unsigned_unscaled, GLD1H_S_UXTW>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>; @@ -1051,7 +1180,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>; defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>; defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // Scatters using unpacked, unscaled 32-bit offsets, e.g. @@ -1100,12 +1229,87 @@ let Predicates = [HasSVE] in { // Scatters using scaled 64-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d, lsl #1] - defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; - defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; - defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; + defm SST1H_D : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; + defm SST1W_D : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; + defm SST1D : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; + + multiclass sve_masked_scatter_x2_scaled<ValueType Ty, SDPatternOperator Store, string Inst> { + // base + vector of scaled offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 ZPR:$offs)), + (!cast<Instruction>(Inst # _SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit scaled offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)), + (!cast<Instruction>(Inst # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit scaled offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))), + (!cast<Instruction>(Inst # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_scatter_x2_unscaled<ValueType Ty, SDPatternOperator Store, string Inst, Operand ImmTy> { + // vector of pointers + immediate offset (includes zero) + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), (i64 ImmTy:$imm), (nxv2i64 ZPR:$ptrs)), + (!cast<Instruction>(Inst # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, ImmTy:$imm)>; + // base + vector of offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (nxv2i64 
ZPR:$offs)), + (!cast<Instruction>(Inst) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of signed 32bit offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (sext_inreg (nxv2i64 ZPR:$offs), nxv2i32)), + (!cast<Instruction>(Inst # _SXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + // base + vector of unsigned 32bit offsets + def : Pat<(Store (Ty ZPR:$data), (nxv2i1 PPR:$gp), GPR64:$base, (and (nxv2i64 ZPR:$offs), (nxv2i64 (splat_vector (i64 0xFFFFFFFF))))), + (!cast<Instruction>(Inst # _UXTW) ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + multiclass sve_masked_scatter_x4<ValueType Ty, SDPatternOperator Store, Instruction Inst> { + def : Pat<(Store (Ty ZPR:$data), (nxv4i1 PPR:$gp), GPR64:$base, (nxv4i32 ZPR:$offs)), + (Inst ZPR:$data, PPR:$gp, GPR64:$base, ZPR:$offs)>; + } + + defm : sve_masked_scatter_x2_scaled<nxv2i64, trunc_masked_scatter_i16_signed_scaled, "SST1H_D">; + defm : sve_masked_scatter_x2_scaled<nxv2i64, trunc_masked_scatter_i32_signed_scaled, "SST1W_D">; + defm : sve_masked_scatter_x2_scaled<nxv2i64, nontrunc_masked_scatter_signed_scaled, "SST1D">; + defm : sve_masked_scatter_x2_scaled<nxv2f16, nontrunc_masked_scatter_signed_scaled, "SST1H_D">; + defm : sve_masked_scatter_x2_scaled<nxv2f32, nontrunc_masked_scatter_signed_scaled, "SST1W_D">; + defm : sve_masked_scatter_x2_scaled<nxv2f64, nontrunc_masked_scatter_signed_scaled, "SST1D">; + defm : sve_masked_scatter_x2_scaled<nxv2bf16, nontrunc_masked_scatter_signed_scaled, "SST1H_D">; + + defm : sve_masked_scatter_x2_unscaled<nxv2i64, trunc_masked_scatter_i8_signed_unscaled, "SST1B_D" , imm0_31>; + defm : sve_masked_scatter_x2_unscaled<nxv2i64, trunc_masked_scatter_i16_signed_unscaled, "SST1H_D", uimm5s2>; + defm : sve_masked_scatter_x2_unscaled<nxv2i64, trunc_masked_scatter_i32_signed_unscaled, "SST1W_D", uimm5s4>; + defm : sve_masked_scatter_x2_unscaled<nxv2i64, nontrunc_masked_scatter_signed_unscaled, "SST1D", uimm5s8>; + defm : 
sve_masked_scatter_x2_unscaled<nxv2f16, nontrunc_masked_scatter_signed_unscaled, "SST1H_D", uimm5s2>; + defm : sve_masked_scatter_x2_unscaled<nxv2f32, nontrunc_masked_scatter_signed_unscaled, "SST1W_D", uimm5s4>; + defm : sve_masked_scatter_x2_unscaled<nxv2f64, nontrunc_masked_scatter_signed_unscaled, "SST1D", uimm5s8>; + defm : sve_masked_scatter_x2_unscaled<nxv2bf16, nontrunc_masked_scatter_signed_unscaled, "SST1H_D", uimm5s2>; + + defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_signed_scaled, SST1H_S_SXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_signed_scaled, SST1W_SXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_signed_scaled, SST1H_S_SXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_signed_scaled, SST1W_SXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4bf16, nontrunc_masked_scatter_signed_scaled, SST1H_S_SXTW_SCALED>; + + defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i8_signed_unscaled, SST1B_S_SXTW>; + defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_signed_unscaled, SST1H_S_SXTW>; + defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_signed_unscaled, SST1W_SXTW>; + defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_signed_unscaled, SST1H_S_SXTW>; + defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_signed_unscaled, SST1W_SXTW>; + defm : sve_masked_scatter_x4<nxv4bf16, nontrunc_masked_scatter_signed_unscaled, SST1H_S_SXTW>; + + defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_unsigned_scaled, SST1H_S_UXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_unsigned_scaled, SST1W_UXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_unsigned_scaled, SST1H_S_UXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_unsigned_scaled, SST1W_UXTW_SCALED>; + defm : sve_masked_scatter_x4<nxv4bf16, 
nontrunc_masked_scatter_unsigned_scaled, SST1H_S_UXTW_SCALED>; + + defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i8_unsigned_unscaled, SST1B_S_UXTW>; + defm : sve_masked_scatter_x4<nxv4i32, trunc_masked_scatter_i16_unsigned_unscaled, SST1H_S_UXTW>; + defm : sve_masked_scatter_x4<nxv4i32, nontrunc_masked_scatter_unsigned_unscaled, SST1W_UXTW>; + defm : sve_masked_scatter_x4<nxv4f16, nontrunc_masked_scatter_unsigned_unscaled, SST1H_S_UXTW>; + defm : sve_masked_scatter_x4<nxv4f32, nontrunc_masked_scatter_unsigned_unscaled, SST1W_UXTW>; + defm : sve_masked_scatter_x4<nxv4bf16, nontrunc_masked_scatter_unsigned_unscaled, SST1H_S_UXTW>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // ST(2|3|4) structured stores (register + immediate) defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>; defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>; @@ -1161,7 +1365,7 @@ let Predicates = [HasSVEorStreamingSVE] in { // Contiguous prefetch (register + register) def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>; def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>; - def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; + def PRFW_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>; multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, ComplexPattern AddrCP> { @@ -1184,9 +1388,9 @@ let Predicates = [HasSVEorStreamingSVE] in { defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, am_sve_regreg_lsl0>; defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, am_sve_regreg_lsl1>; - defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, am_sve_regreg_lsl2>; + defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFW_PRR, am_sve_regreg_lsl2>; defm : 
sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, am_sve_regreg_lsl3>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { // Gather prefetch using scaled 32-bit offsets, e.g. @@ -1249,7 +1453,7 @@ let Predicates = [HasSVE] in { // Patterns to generate adr instruction. // adr z0.d, [z0.d, z0.d, uxtw] def : Pat<(add nxv2i64:$Op1, - (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))))), + (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))))), (ADR_UXTW_ZZZ_D_0 $Op1, $Op2)>; // adr z0.d, [z0.d, z0.d, sxtw] def : Pat<(add nxv2i64:$Op1, @@ -1262,7 +1466,7 @@ let Predicates = [HasSVE] in { def : Pat<(add Ty:$Op1, (Ty (AArch64lsl_p (PredTy (SVEAllActive)), Ty:$Op2, - (Ty (AArch64dup (ShiftTy ShiftAmt)))))), + (Ty (splat_vector (ShiftTy ShiftAmt)))))), (DestAdrIns $Op1, $Op2)>; } defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_1, 1>; @@ -1277,14 +1481,14 @@ let Predicates = [HasSVE] in { multiclass adrXtwShiftPat<ValueType Ty, ValueType PredTy, int ShiftAmt> { def : Pat<(add Ty:$Op1, (Ty (AArch64lsl_p (PredTy (SVEAllActive)), - (Ty (and Ty:$Op2, (Ty (AArch64dup (i64 0xFFFFFFFF))))), - (Ty (AArch64dup (i64 ShiftAmt)))))), + (Ty (and Ty:$Op2, (Ty (splat_vector (i64 0xFFFFFFFF))))), + (Ty (splat_vector (i64 ShiftAmt)))))), (!cast<Instruction>("ADR_UXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>; def : Pat<(add Ty:$Op1, (Ty (AArch64lsl_p (PredTy (SVEAllActive)), (Ty (sext_inreg Ty:$Op2, nxv2i32)), - (Ty (AArch64dup (i64 ShiftAmt)))))), + (Ty (splat_vector (i64 ShiftAmt)))))), (!cast<Instruction>("ADR_SXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>; } defm : adrXtwShiftPat<nxv2i64, nxv2i1, 1>; @@ -1292,7 +1496,7 @@ let Predicates = [HasSVE] in { defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; @@ -1310,6 
+1514,10 @@ let Predicates = [HasSVEorStreamingSVE] in { defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; // Extract lo/hi halves of legal predicate types. + def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP PPR:$Ps)>; + def : Pat<(nxv1i1 (extract_subvector (nxv2i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP PPR:$Ps)>; def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP PPR:$Ps)>; def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), @@ -1400,6 +1608,8 @@ let Predicates = [HasSVEorStreamingSVE] in { (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>; // Concatenate two predicates. + def : Pat<(nxv2i1 (concat_vectors nxv1i1:$p1, nxv1i1:$p2)), + (UZP1_PPP_D $p1, $p2)>; def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)), (UZP1_PPP_S $p1, $p2)>; def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)), @@ -1475,7 +1685,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETONE, SETNE, SETONE, SETNE>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETUNE, SETNE, SETUNE, SETNE>; defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", SETUO, SETUO, SETUO, SETUO>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; @@ -1485,7 +1695,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt", SETOLT, SETLT, SETOGT, SETGT>; defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle", SETOLE, SETLE, SETOGE, SETGE>; defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; - defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETONE, SETNE, 
SETONE, SETNE>; + defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETUNE, SETNE, SETUNE, SETNE>; defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>; @@ -1522,7 +1732,7 @@ let Predicates = [HasSVEorStreamingSVE] in { defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd", add, int_aarch64_sve_cntd>; defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd", sub, int_aarch64_sve_cntd>; -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>; defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>; defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>; @@ -1619,16 +1829,16 @@ let Predicates = [HasSVEorStreamingSVE] in { defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME -let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in { defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>; defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>; defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>; defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_m1>; -} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos +} // End HasSVEorSME, UseExperimentalZeroingPseudos -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm ASR_ZPmZ : 
sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">; defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">; defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">; @@ -1679,60 +1889,61 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), - (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + //These patterns exist to improve the code quality of conversions on unpacked types. + def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), + (FCVT_ZPmZ_HtoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; // FP_ROUND has an additional 'precise' flag which indicates the type of rounding. 
// This is ignored by the pattern below where it is matched by (i64 timm0_1) - def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), - (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), + (FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - // Floating-point -> signed integer - def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + // Signed integer -> Floating-point + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))), - (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 PPR:$Pg), + def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 (SVEAllActive):$Pg), (sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))), - (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))), - (SCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))), - (SCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))), - (SCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (SCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - // Floating-point -> unsigned integer - def : Pat<(nxv2f16 (AArch64ucvtf_mt 
(nxv2i1 PPR:$Pg), + // Unsigned integer -> Floating-point + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))), - (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))), - (UCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 PPR:$Pg), + def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 (SVEAllActive):$Pg), (and (nxv4i32 ZPR:$Zs), - (nxv4i32 (AArch64dup (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))), - (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv4i32 (splat_vector (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))), - (UCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))), + (UCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))), - (UCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + (nxv2i64 (splat_vector (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))), + (UCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>; defm FRINTP_ZPmZ : 
sve_fp_2op_p_zd_HSD<0b00001, "frintp", AArch64frintp_mt>; @@ -1743,27 +1954,27 @@ let Predicates = [HasSVEorStreamingSVE] in { defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>; defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>; defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME -let Predicates = [HasBF16, HasSVEorStreamingSVE] in { +let Predicates = [HasBF16, HasSVEorSME] in { defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>; defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>; -} // End HasBF16, HasSVEorStreamingSVE +} // End HasBF16, HasSVEorSME let Predicates = [HasBF16, HasSVE] in { defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>; } // End HasBF16, HasSVE -let Predicates = [HasBF16, HasSVEorStreamingSVE] in { - defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>; - defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>; - defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>; - defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>; +let Predicates = [HasBF16, HasSVEorSME] in { + defm BFMLALB_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>; + defm BFMLALT_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>; + defm BFMLALB_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>; + defm BFMLALT_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>; defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; -} // End HasBF16, HasSVEorStreamingSVE +} // End HasBF16, HasSVEorSME -let Predicates = 
[HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { // InstAliases def : InstAlias<"mov $Zd, $Zn", (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>; @@ -1875,7 +2086,7 @@ let Predicates = [HasSVEorStreamingSVE] in { let AddedComplexity = 1 in { class LD1RPat<ValueType vt, SDPatternOperator operator, Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> : - Pat<(vt (AArch64dup (index_vt (operator (CP GPR64:$base, immtype:$offset))))), + Pat<(vt (splat_vector (index_vt (operator (CP GPR64:$base, immtype:$offset))))), (load (ptrue 31), GPR64:$base, $offset)>; } @@ -1963,22 +2174,22 @@ let Predicates = [HasSVEorStreamingSVE] in { GPR32:$op, sub_32), $imm), sub_32))>; - def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), + def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), (INCH_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), + def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (splat_vector (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), (INCW_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))), + def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (splat_vector (i64 (vscale (sve_cntd_imm i32:$imm))))))), (INCD_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), + def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), (DECH_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), + def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (splat_vector (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), (DECW_ZPiI ZPR:$op, 31, $imm)>; - def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 
(AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))), + def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 (splat_vector (i64 (vscale (sve_cntd_imm i32:$imm))))))), (DECD_ZPiI ZPR:$op, 31, $imm)>; } - let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL], AddedComplexity = 5 in { + let Predicates = [HasSVEorSME, UseScalarIncVL], AddedComplexity = 5 in { def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))), (INCH_XPiI GPR64:$op, 31, $imm)>; def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))), @@ -2098,15 +2309,23 @@ let Predicates = [HasSVEorStreamingSVE] in { def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 
(reinterpret_cast (nxv1i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv1i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; // These allow casting from/to unpacked floating-point types. def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; @@ -2145,12 +2364,12 @@ let Predicates = [HasSVEorStreamingSVE] in { } // 2-element contiguous loads - defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>; - defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>; - defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>; - defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>; - defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>; - defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>; + defm : pred_load<nxv2i64, nxv2i1, azext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv2i64, nxv2i1, sext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv2i64, nxv2i1, azext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv2i64, nxv2i1, sext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv2i64, nxv2i1, azext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>; + defm : pred_load<nxv2i64, nxv2i1, sext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>; defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, 
am_sve_regreg_lsl3>; defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>; defm : pred_load<nxv2bf16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>; @@ -2158,18 +2377,18 @@ let Predicates = [HasSVEorStreamingSVE] in { defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>; // 4-element contiguous loads - defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>; - defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>; - defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>; - defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv4i32, nxv4i1, azext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv4i32, nxv4i1, sext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv4i32, nxv4i1, azext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv4i32, nxv4i1, sext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>; defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>; defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>; defm : pred_load<nxv4bf16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>; defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>; // 8-element contiguous loads - defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>; - defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv8i16, nxv8i1, azext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>; + defm : pred_load<nxv8i16, nxv8i1, sext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, 
am_sve_regreg_lsl0>; defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; @@ -2397,7 +2616,7 @@ let Predicates = [HasSVEorStreamingSVE] in { // 16-element contiguous loads defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE] in { multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> { @@ -2482,7 +2701,7 @@ let Predicates = [HasSVE] in { defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; } // End HasSVE -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty, SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> { // reg + reg @@ -2716,7 +2935,7 @@ let Predicates = [HasSVEorStreamingSVE] in { def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; } -} // End HasSVEorStreamingSVE +} // End HasSVEorSME let Predicates = [HasSVE, HasMatMulInt8] in { defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>; @@ -2724,11 +2943,11 @@ let Predicates = [HasSVE, HasMatMulInt8] in { defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>; } // End HasSVE, HasMatMulInt8 -let Predicates = [HasSVEorStreamingSVE, HasMatMulInt8] in { +let Predicates = [HasSVEorSME, HasMatMulInt8] in { defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>; defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>; defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>; -} // End HasSVEorStreamingSVE, HasMatMulInt8 +} // End 
HasSVEorSME, HasMatMulInt8 let Predicates = [HasSVE, HasMatMulFP32] in { defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>; @@ -2746,16 +2965,16 @@ let Predicates = [HasSVE, HasMatMulFP64] in { defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>; } // End HasSVE, HasMatMulFP64 -let Predicates = [HasSVEorStreamingSVE, HasMatMulFP64] in { +let Predicates = [HasSVEorSME, HasMatMulFP64] in { defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>; defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>; defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>; defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>; defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>; defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>; -} // End HasSVEorStreamingSVE, HasMatMulFP64 +} // End HasSVEorSME, HasMatMulFP64 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 integer multiply-add (indexed) defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>; @@ -2903,17 +3122,17 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm UQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqshl>; defm SQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_sqrshl>; defm UQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd<int_aarch64_sve_uqrshl>; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME -let Predicates = [HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos] in { +let Predicates = [HasSVE2orSME, UseExperimentalZeroingPseudos] in { defm SQSHL_ZPZI : 
sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>; defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>; defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>; defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>; defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>; -} // End HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos +} // End HasSVE2orSME, UseExperimentalZeroingPseudos -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 predicated shifts defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>; defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>; @@ -2960,18 +3179,18 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>; // SVE2 bitwise shift right and accumulate - defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>; - defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>; - defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>; - defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>; + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", AArch64ssra>; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", AArch64usra>; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra, int_aarch64_sve_srshr>; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra, int_aarch64_sve_urshr>; // SVE2 complex integer add defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>; defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>; // SVE2 integer absolute difference 
and accumulate - defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>; - defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>; + defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", AArch64saba>; + defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", AArch64uaba>; // SVE2 integer absolute difference and accumulate long defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>; @@ -3026,7 +3245,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>; defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>; defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 character match @@ -3034,7 +3253,7 @@ let Predicates = [HasSVE2] in { defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>; } // End HasSVE2 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 bitwise exclusive-or interleaved defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>; defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>; @@ -3049,7 +3268,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>; defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>; defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 histogram generation (segment) @@ -3059,7 +3278,7 @@ let Predicates = [HasSVE2] in { defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>; } // End HasSVE2 -let 
Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 floating-point base 2 logarithm as integer defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>; @@ -3091,7 +3310,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { // SVE2 bitwise ternary operations defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>; defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>; - defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>; + defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl, AArch64bsp>; defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>; defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>; @@ -3101,7 +3320,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { // SVE2 extract vector (immediate offset, constructive) def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 non-temporal gather loads @@ -3120,10 +3339,10 @@ let Predicates = [HasSVE2] in { defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>; } // End HasSVE2 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2] in { // SVE2 non-temporal scatter stores @@ -3137,7 +3356,7 @@ let Predicates = [HasSVE2] in { defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; } // End HasSVE2 -let Predicates = [HasSVE2orStreamingSVE] in { +let Predicates = [HasSVE2orSME] in { // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", 
int_aarch64_sve_tbl2>; defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>; @@ -3156,7 +3375,7 @@ let Predicates = [HasSVE2orStreamingSVE] in { // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; -} // End HasSVE2orStreamingSVE +} // End HasSVE2orSME let Predicates = [HasSVE2AES] in { // SVE2 crypto destructive binary operations diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td index 009219ce3c54..c6b112d0d2f1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -6,7 +6,10 @@ // //===----------------------------------------------------------------------===// // -// This file defines the machine model for the ARM Cortex-A55 processors. +// This file defines the machine model for the ARM Cortex-A55 processors. Note +// that this schedule is currently used as the default for -mcpu=generic. As a +// result, some of the modelling decision made do not precisely model the +// Cortex-A55, instead aiming to be a good compromise between different cpus. 
// //===----------------------------------------------------------------------===// @@ -149,8 +152,31 @@ def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; } def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; } def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; } def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; } -def : WriteRes<WriteVd, [CortexA55UnitFPALU]> { let Latency = 4; } -def : WriteRes<WriteVq, [CortexA55UnitFPALU,CortexA55UnitFPALU]> { let Latency = 4; let BeginGroup = 1; } + +// NEON +class CortexA55WriteVd<int n, ProcResourceKind res> : SchedWriteRes<[res]> { + let Latency = n; +} +class CortexA55WriteVq<int n, ProcResourceKind res> : SchedWriteRes<[res, res]> { + let Latency = n; + let BeginGroup = 1; +} +def CortexA55WriteDotScVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteDotVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteDotVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaLVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaIxVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteAluVd_3 : CortexA55WriteVd<3, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_3 : CortexA55WriteVq<3, CortexA55UnitFPALU>; +def CortexA55WriteAluVd_2 : CortexA55WriteVd<2, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_2 : CortexA55WriteVq<2, CortexA55UnitFPALU>; +def CortexA55WriteAluVd_1 : CortexA55WriteVd<1, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_1 : CortexA55WriteVq<1, CortexA55UnitFPALU>; +def : SchedAlias<WriteVd, CortexA55WriteVd<4, CortexA55UnitFPALU>>; +def : SchedAlias<WriteVq, CortexA55WriteVq<4, CortexA55UnitFPALU>>; // FP ALU specific new schedwrite definitions def CortexA55WriteFPALU_F2 : 
SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;} @@ -358,4 +384,99 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +// 4.15. Advanced SIMD integer instructions +// ASIMD absolute diff +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]ABDv(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDv(16i8|4i32|8i16)")>; +// ASIMD absolute diff accum +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ABAL?v")>; +// ASIMD absolute diff long +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDLv")>; +// ASIMD arith #1 +def : InstRW<[CortexA55WriteAluVd_2], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)", + "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)", + "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>; +// ASIMD arith #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "ABSv(1i64|2i32|4i16|8i8)$", + "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$", + "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$", + "ADDPv(2i32|4i16|8i8)$")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "ABSv(2i64|4i32|8i16|16i8)$", + "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$", + "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$", + "ADDPv(16i8|2i64|4i32|8i16)$")>; +// ASIMD arith #3 +def : InstRW<[CortexA55WriteAluVq_3], (instregex "SADDLv", "UADDLv", "SADDWv", + "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>; +// ASIMD arith #5 +def : InstRW<[CortexA55WriteAluVq_4], (instregex "RADDHNv", "RSUBHNv")>; +// ASIMD arith, reduce +def : InstRW<[CortexA55WriteAluVq_3], (instregex "ADDVv", "SADDLVv", "UADDLVv")>; +// ASIMD compare #1 +def : InstRW<[CortexA55WriteAluVd_2], (instregex 
"CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>; +// ASIMD compare #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>; +// ASIMD logical $1 +def : InstRW<[CortexA55WriteAluVd_1], (instregex "(AND|EOR|NOT|ORN)v8i8", + "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>; +def : InstRW<[CortexA55WriteAluVq_1], (instregex "(AND|EOR|NOT|ORN)v16i8", + "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>; +// ASIMD max/min, basic +def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; +// SIMD max/min, reduce +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU](MAX|MIN)Vv")>; +// ASIMD multiply, by element +def : InstRW<[CortexA55WriteAluVq_4], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$", + "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>; +// ASIMD multiply +def : InstRW<[CortexA55WriteAluVd_3], (instrs PMULv8i8)>; +def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULv16i8)>; +// ASIMD multiply accumulate +def : InstRW<[CortexA55WriteMlaVd_4], (instregex "ML[AS]v(2i32|4i16|8i8)$")>; +def : InstRW<[CortexA55WriteMlaVq_4], (instregex "ML[AS]v(16i8|4i32|8i16)$")>; +def : InstRW<[CortexA55WriteMlaIxVq_4], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>; +// ASIMD multiply accumulate half +def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQRDML[AS]H[vi]")>; +// ASIMD multiply accumulate long +def : InstRW<[CortexA55WriteMlaLVq_4], (instregex "[SU]ML[AS]Lv")>; +// ASIMD multiply accumulate long #2 +def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQDML[AS]L[iv]")>; +// ASIMD dot product +def : InstRW<[CortexA55WriteDotVd_4], (instregex "[SU]DOTv8i8")>; +def : InstRW<[CortexA55WriteDotVq_4], (instregex "[SU]DOTv16i8")>; +// 
ASIMD dot product, by scalar +def : InstRW<[CortexA55WriteDotScVq_4], (instregex "[SU]DOTlanev")>; +// ASIMD multiply long +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]MULLv", "SQDMULL[iv]")>; +// ASIMD polynomial (8x8) multiply long +def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULLv8i8, PMULLv16i8)>; +// ASIMD pairwise add and accumulate +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]ADALPv")>; +// ASIMD shift accumulate +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; +// ASIMD shift accumulate #2 +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]RSRA[vd]")>; +// ASIMD shift by immed +def : InstRW<[CortexA55WriteAluVd_2], (instregex "SHLd$", "SHLv", + "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>; +// ASIMD shift by immed +// SXTL and UXTL are aliases for SHLL +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[US]?SHLLv")>; +// ASIMD shift by immed #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)", + "RSHRNv(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHRv(16i8|2i64|4i32|8i16)", + "RSHRNv(16i8|4i32|8i16)")>; +// ASIMD shift by register +def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>; +// ASIMD shift by register #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>; + } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index fa10d056b7f7..6b053f1969b4 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -22,7 +22,7 @@ def A64FXModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = [HasSVE2, HasSVE2AES, 
HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth, - HasSVE2orStreamingSVE]; + HasSVE2orSME]; let FullInstRWOverlapCheck = 0; } @@ -3348,7 +3348,7 @@ def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRI)>; def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFH_D_PZI, PRFH_S_PZI)>; // [351] "prfw $prfop, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFS_PRR)>; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFW_PRR)>; // [352] "prfw $prfop, $Pg, [$Rn, $Zm]"; def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFW_D_SCALED, PRFW_D_SXTW_SCALED, PRFW_D_UXTW_SCALED, PRFW_S_SXTW_SCALED, PRFW_S_UXTW_SCALED)>; @@ -3554,7 +3554,7 @@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCW_ZPiI)>; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B, ST1B_D, ST1B_H, ST1B_S)>; // [421] "st1b $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D_REAL, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>; // [422] "st1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B_D_IMM, ST1B_H_IMM, ST1B_IMM, ST1B_S_IMM)>; @@ -3566,7 +3566,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1B_D_IMM, SST1B_S_IMM)>; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D)>; // [425] "st1d $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D_REAL, SST1D_SCALED_SCALED_REAL, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D, SST1D_SCALED, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>; // [426] "st1d $Zt, $Pg, [$Rn, $imm4, mul vl]"; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D_IMM)>; @@ -3578,7 +3578,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1D_IMM)>; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H, ST1H_D, ST1H_S)>; // [429] "st1h $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D_REAL, 
SST1H_D_SCALED_SCALED_REAL, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D, SST1H_D_SCALED, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>; // [430] "st1h $Zt, $Pg, [$Rn, $imm4, mul vl]"; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H_D_IMM, ST1H_IMM, ST1H_S_IMM)>; @@ -3590,7 +3590,7 @@ def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1H_D_IMM, SST1H_S_IMM)>; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W, ST1W_D)>; // [433] "st1w $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D_REAL, SST1W_D_SCALED_SCALED_REAL, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D, SST1W_D_SCALED, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>; // [434] "st1w $Zt, $Pg, [$Rn, $imm4, mul vl]"; def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W_D_IMM, ST1W_IMM)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td new file mode 100644 index 000000000000..32f7299fbf87 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td @@ -0,0 +1,1136 @@ +//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1 to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). + +def Ampere1Model : SchedMachineModel { + let IssueWidth = 4; // 4-way decode and dispatch + let MicroOpBufferSize = 174; // micro-op re-order buffer size + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F); +} + +let SchedModel = Ampere1Model in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1. +// Ampere-1 has 12 pipelines that 8 independent scheduler (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. 
+ +def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1UnitL : ProcResource<2>; // load +def Ampere1UnitS : ProcResource<2>; // store address calculation +def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>; +def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. + +def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; 
+} + +def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, + Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 4; + let NumMicroOps = 1; 
+} + +def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let 
Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 1; +} + +def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 7; + let NumMicroOps = 1; +} + +def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, 
Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, + Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + 
+def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 16; +} + +def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 6; +} + +def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def 
Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 3; +} + +def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 4; +} + +def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 18; + let NumMicroOps = 1; +} + +def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 25; + let NumMicroOps = 1; +} + +def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 32; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 62; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), +// which 
are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. +def Ampere1Write_Arith : SchedWriteVariant<[ + SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>, + SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1AB]>, + SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1AB]>]>; + +def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[ + SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>, + SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1A]>, + SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1A]>]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. + +def : WriteRes<WriteImm, [Ampere1UnitAB]>; // MOVN, MOVZ +def : WriteRes<WriteI, [Ampere1UnitAB]>; // ALU +def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes<WriteExtr, [Ampere1UnitB]>; // EXTR shifts a reg pair +def : WriteRes<WriteIS, [Ampere1UnitB]>; // Shift/Scale +def : WriteRes<WriteID32, [Ampere1UnitBS]> { + let Latency = 18; +} // 32-bit Divide +def : WriteRes<WriteID64, [Ampere1UnitBS]> { + let Latency = 34; +} // 64-bit Divide +def : WriteRes<WriteIM32, [Ampere1UnitBS]> { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes<WriteIM64, [Ampere1UnitBS]> { + let Latency = 3; +} // 64-bit Multiply +def : WriteRes<WriteBr, [Ampere1UnitA]>; +def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>; +def : WriteRes<WriteLD, [Ampere1UnitL]> { + let Latency = 4; +} // Load from base addr plus immediate offset +def : WriteRes<WriteST, [Ampere1UnitS]> { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes<WriteSTP, [Ampere1UnitS, Ampere1UnitS]> { +
let Latency = 1; + let NumMicroOps = 2; +} // Store a register pair. +def : WriteRes<WriteAdr, [Ampere1UnitAB]>; +def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> { + let Latency = 5; + let NumMicroOps = 2; +} // Load from a register index (maybe scaled). +def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes<WriteF, [Ampere1UnitXY]> { + let Latency = 2; +} // General floating-point ops. +def : WriteRes<WriteFCmp, [Ampere1UnitX]> { + let Latency = 5; +} // Floating-point compare. +def : WriteRes<WriteFCvt, [Ampere1UnitXY]> { + let Latency = 6; +} // Float conversion. +def : WriteRes<WriteFCopy, [Ampere1UnitXY]> { +} // Float-int register copy. +def : WriteRes<WriteFImm, [Ampere1UnitXY]> { + let Latency = 2; +} // Floating-point immediate. +def : WriteRes<WriteFMul, [Ampere1UnitXY]> { + let Latency = 5; +} // Floating-point multiply. +def : WriteRes<WriteFDiv, [Ampere1UnitXY]> { + let Latency = 34; +} // Floating-point division. +def : WriteRes<WriteVd, [Ampere1UnitXY]> { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes<WriteVq, [Ampere1UnitXY]> { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes<WriteVLD, [Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; +} // Vector loads. +def : WriteRes<WriteVST, [Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; +} // Vector stores. + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteLDHi, []> { + let Latency = 4; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic.
+def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadST, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1. + +def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>; + +// FP and vector load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + 
(instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1Write_9cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 
3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_10cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1Write_9cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1Write_12cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1Write_11cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures 
from 2 registers +def : InstRW<[Ampere1Write_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1Write_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1Write_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex 
"^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1Write_4cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>; +def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], 
(instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>; +def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1Write_62cyc_1XY], 
(instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1Write_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>; +def : InstRW<[Ampere1Write_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(ADC|SBC)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1Write_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1Write_4cyc_1BS], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1Write_4cyc_2L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : 
InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_5cyc_1AB_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1Write_2cyc_1AB_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1Write_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1Write_2cyc_1B_1S], + (instrs STPWi, STPXi)>; +def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB], + (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>; + +// Pointer authentication +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, 
AUTIB1716)>; +def : InstRW<[Ampere1Write_8cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1Write_8cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>; +def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- arithmetic, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : 
InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1Write_6cyc_2XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating 
+def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1Write_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1Write_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1Write_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1Write_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1Model diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td new file mode 100644 index 000000000000..8552c07bda56 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td @@ -0,0 +1,25 @@ +//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Ampere Computing processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check for a LSL shift <= 4 +def AmpereCheapLSL : MCSchedPredicate< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3, + CheckShiftBy4]>]>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td index fcda2394bacf..ee7cc1f5095b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td @@ -109,10 +109,7 @@ def ExynosScaledIdxFn : TIIPredicate<"isExynosScaledAddr", def ExynosScaledIdxPred : MCSchedPredicate<ExynosScaledIdxFn>; // Identify FP instructions. -def ExynosFPPred : MCSchedPredicate<CheckAny<[CheckHForm, - CheckSForm, - CheckDForm, - CheckQForm]>>; +def ExynosFPPred : MCSchedPredicate<CheckFpOrNEON>; // Identify 128-bit NEON instructions. def ExynosQFormPred : MCSchedPredicate<CheckQForm>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td index fc13b23b4cf8..4473f3a53845 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td @@ -53,152 +53,23 @@ let FunctionMapper = "AArch64_AM::getShiftType" in { } // Check for shifting in arithmetic and logic instructions. -foreach I = {0-3, 8} in { +foreach I = {0-4, 8} in { let FunctionMapper = "AArch64_AM::getShiftValue" in def CheckShiftBy#I : CheckImmOperand<3, I>; } // Generic predicates. 
- -// Identify whether an instruction is the 16-bit NEON form based on its result. -def CheckHForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, H0>, - CheckRegOperand<0, H1>, - CheckRegOperand<0, H2>, - CheckRegOperand<0, H3>, - CheckRegOperand<0, H4>, - CheckRegOperand<0, H5>, - CheckRegOperand<0, H6>, - CheckRegOperand<0, H7>, - CheckRegOperand<0, H8>, - CheckRegOperand<0, H9>, - CheckRegOperand<0, H10>, - CheckRegOperand<0, H11>, - CheckRegOperand<0, H12>, - CheckRegOperand<0, H13>, - CheckRegOperand<0, H14>, - CheckRegOperand<0, H15>, - CheckRegOperand<0, H16>, - CheckRegOperand<0, H17>, - CheckRegOperand<0, H18>, - CheckRegOperand<0, H19>, - CheckRegOperand<0, H20>, - CheckRegOperand<0, H21>, - CheckRegOperand<0, H22>, - CheckRegOperand<0, H23>, - CheckRegOperand<0, H24>, - CheckRegOperand<0, H25>, - CheckRegOperand<0, H26>, - CheckRegOperand<0, H27>, - CheckRegOperand<0, H28>, - CheckRegOperand<0, H29>, - CheckRegOperand<0, H30>, - CheckRegOperand<0, H31>]>]>; - -// Identify whether an instruction is the 32-bit NEON form based on its result. 
-def CheckSForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, S0>, - CheckRegOperand<0, S1>, - CheckRegOperand<0, S2>, - CheckRegOperand<0, S3>, - CheckRegOperand<0, S4>, - CheckRegOperand<0, S5>, - CheckRegOperand<0, S6>, - CheckRegOperand<0, S7>, - CheckRegOperand<0, S8>, - CheckRegOperand<0, S9>, - CheckRegOperand<0, S10>, - CheckRegOperand<0, S11>, - CheckRegOperand<0, S12>, - CheckRegOperand<0, S13>, - CheckRegOperand<0, S14>, - CheckRegOperand<0, S15>, - CheckRegOperand<0, S16>, - CheckRegOperand<0, S17>, - CheckRegOperand<0, S18>, - CheckRegOperand<0, S19>, - CheckRegOperand<0, S20>, - CheckRegOperand<0, S21>, - CheckRegOperand<0, S22>, - CheckRegOperand<0, S23>, - CheckRegOperand<0, S24>, - CheckRegOperand<0, S25>, - CheckRegOperand<0, S26>, - CheckRegOperand<0, S27>, - CheckRegOperand<0, S28>, - CheckRegOperand<0, S29>, - CheckRegOperand<0, S30>, - CheckRegOperand<0, S31>]>]>; - -// Identify whether an instruction is the 64-bit NEON form based on its result. -def CheckDForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, D0>, - CheckRegOperand<0, D1>, - CheckRegOperand<0, D2>, - CheckRegOperand<0, D3>, - CheckRegOperand<0, D4>, - CheckRegOperand<0, D5>, - CheckRegOperand<0, D6>, - CheckRegOperand<0, D7>, - CheckRegOperand<0, D8>, - CheckRegOperand<0, D9>, - CheckRegOperand<0, D10>, - CheckRegOperand<0, D11>, - CheckRegOperand<0, D12>, - CheckRegOperand<0, D13>, - CheckRegOperand<0, D14>, - CheckRegOperand<0, D15>, - CheckRegOperand<0, D16>, - CheckRegOperand<0, D17>, - CheckRegOperand<0, D18>, - CheckRegOperand<0, D19>, - CheckRegOperand<0, D20>, - CheckRegOperand<0, D21>, - CheckRegOperand<0, D22>, - CheckRegOperand<0, D23>, - CheckRegOperand<0, D24>, - CheckRegOperand<0, D25>, - CheckRegOperand<0, D26>, - CheckRegOperand<0, D27>, - CheckRegOperand<0, D28>, - CheckRegOperand<0, D29>, - CheckRegOperand<0, D30>, - CheckRegOperand<0, D31>]>]>; +// Identify whether an instruction is NEON or floating point +def CheckFpOrNEON 
: CheckFunctionPredicateWithTII< + "AArch64_MC::isFpOrNEON", + "AArch64InstrInfo::isFpOrNEON" +>; // Identify whether an instruction is the 128-bit NEON form based on its result. -def CheckQForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, Q0>, - CheckRegOperand<0, Q1>, - CheckRegOperand<0, Q2>, - CheckRegOperand<0, Q3>, - CheckRegOperand<0, Q4>, - CheckRegOperand<0, Q5>, - CheckRegOperand<0, Q6>, - CheckRegOperand<0, Q7>, - CheckRegOperand<0, Q8>, - CheckRegOperand<0, Q9>, - CheckRegOperand<0, Q10>, - CheckRegOperand<0, Q11>, - CheckRegOperand<0, Q12>, - CheckRegOperand<0, Q13>, - CheckRegOperand<0, Q14>, - CheckRegOperand<0, Q15>, - CheckRegOperand<0, Q16>, - CheckRegOperand<0, Q17>, - CheckRegOperand<0, Q18>, - CheckRegOperand<0, Q19>, - CheckRegOperand<0, Q20>, - CheckRegOperand<0, Q21>, - CheckRegOperand<0, Q22>, - CheckRegOperand<0, Q23>, - CheckRegOperand<0, Q24>, - CheckRegOperand<0, Q25>, - CheckRegOperand<0, Q26>, - CheckRegOperand<0, Q27>, - CheckRegOperand<0, Q28>, - CheckRegOperand<0, Q29>, - CheckRegOperand<0, Q30>, - CheckRegOperand<0, Q31>]>]>; +def CheckQForm : CheckFunctionPredicateWithTII< + "AArch64_MC::isQForm", + "AArch64InstrInfo::isQForm" +>; // Identify arithmetic instructions with extend. 
def IsArithExtOp : CheckOpcode<[ADDWrx, ADDXrx, ADDSWrx, ADDSXrx, diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 77fca22a5f55..6ecfc97a4273 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -25,7 +25,8 @@ def TSV110Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, - PAUnsupported.F); + PAUnsupported.F, + SMEUnsupported.F); } // Define each kind of processor resource and number available on the TSV110, diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 893269c1a7ef..677797a6797b 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -91,7 +91,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy( SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, Align Alignment, bool isVolatile, + SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo) const { const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); @@ -100,38 +100,6 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size, Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{}); } - - // Check to see if there is a specialized entry-point for memory zeroing. - ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); - ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size); - const char *bzeroName = - (V && V->isZero()) - ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) - : nullptr; - // For small size (< 256), it is not beneficial to use bzero - // instead of memset. 
- if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) { - const AArch64TargetLowering &TLI = *STI.getTargetLowering(); - - EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); - Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Dst; - Entry.Ty = IntPtrTy; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(Chain) - .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroName, IntPtr), - std::move(Args)) - .setDiscardResult(); - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - return CallResult.second; - } return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 47fe3bf7dcf5..73f93724d6fc 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -34,7 +34,7 @@ public: SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, - bool isVolatile, + bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo) const override; SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 566c7a16db23..24816bc9e9bd 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -42,20 +42,23 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAArch64.h" #include 
"llvm/IR/Metadata.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/MemoryTaggingSupport.h" #include <cassert> #include <iterator> +#include <memory> #include <utility> using namespace llvm; @@ -63,12 +66,12 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-stack-tagging" static cl::opt<bool> ClMergeInit( - "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, + "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::desc("merge stack variable initializers with tagging when possible")); static cl::opt<bool> ClUseStackSafety("stack-tagging-use-stack-safety", cl::Hidden, - cl::init(true), cl::ZeroOrMore, + cl::init(true), cl::desc("Use Stack Safety analysis results")); static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit", @@ -78,6 +81,12 @@ static cl::opt<unsigned> ClMergeInitSizeLimit("stack-tagging-merge-init-size-limit", cl::init(272), cl::Hidden); +static cl::opt<size_t> ClMaxLifetimes( + "stack-tagging-max-lifetimes-for-alloca", cl::Hidden, cl::init(3), + cl::ReallyHidden, + cl::desc("How many lifetime ends to handle for a single alloca."), + cl::Optional); + static const Align kTagGranuleSize = Align(16); namespace { @@ -283,15 +292,6 @@ public: }; class AArch64StackTagging : public FunctionPass { - struct AllocaInfo { - AllocaInst *AI; - TrackingVH<Instruction> OldAI; // Track through RAUW to replace debug uses. 
- SmallVector<IntrinsicInst *, 2> LifetimeStart; - SmallVector<IntrinsicInst *, 2> LifetimeEnd; - SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics; - int Tag; // -1 for non-tagged allocations - }; - const bool MergeInit; const bool UseStackSafety; @@ -307,7 +307,6 @@ public: } bool isInterestingAlloca(const AllocaInst &AI); - void alignAndPadAlloca(AllocaInfo &Info); void tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size); @@ -316,9 +315,9 @@ public: Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr, uint64_t Size, InitializerBuilder &IB); - Instruction * - insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas, - const DominatorTree *DT); + Instruction *insertBaseTaggedPointer( + const MapVector<AllocaInst *, memtag::AllocaInfo> &Allocas, + const DominatorTree *DT); bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "AArch64 Stack Tagging"; } @@ -419,7 +418,7 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { bool IsInteresting = AI.getAllocatedType()->isSized() && AI.isStaticAlloca() && // alloca() may be called with 0 size, ignore it. - AI.getAllocationSizeInBits(*DL).getValue() > 0 && + *AI.getAllocationSizeInBits(*DL) > 0 && // inalloca allocas are not treated as static, and we don't want // dynamic alloca instrumentation for them as well. !AI.isUsedWithInAlloca() && @@ -460,15 +459,13 @@ void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore, } Instruction *AArch64StackTagging::insertBaseTaggedPointer( - const MapVector<AllocaInst *, AllocaInfo> &Allocas, + const MapVector<AllocaInst *, memtag::AllocaInfo> &AllocasToInstrument, const DominatorTree *DT) { BasicBlock *PrologueBB = nullptr; // Try sinking IRG as deep as possible to avoid hurting shrink wrap. 
- for (auto &I : Allocas) { - const AllocaInfo &Info = I.second; + for (auto &I : AllocasToInstrument) { + const memtag::AllocaInfo &Info = I.second; AllocaInst *AI = Info.AI; - if (Info.Tag < 0) - continue; if (!PrologueBB) { PrologueBB = AI->getParent(); continue; @@ -486,40 +483,6 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( return Base; } -void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { - const Align NewAlignment = - max(MaybeAlign(Info.AI->getAlign()), kTagGranuleSize); - Info.AI->setAlignment(NewAlignment); - - uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; - uint64_t AlignedSize = alignTo(Size, kTagGranuleSize); - if (Size == AlignedSize) - return; - - // Add padding to the alloca. - Type *AllocatedType = - Info.AI->isArrayAllocation() - ? ArrayType::get( - Info.AI->getAllocatedType(), - cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue()) - : Info.AI->getAllocatedType(); - Type *PaddingType = - ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size); - Type *TypeWithPadding = StructType::get(AllocatedType, PaddingType); - auto *NewAI = new AllocaInst( - TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); - NewAI->takeName(Info.AI); - NewAI->setAlignment(Info.AI->getAlign()); - NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); - NewAI->setSwiftError(Info.AI->isSwiftError()); - NewAI->copyMetadata(*Info.AI); - - auto *NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI); - Info.AI->replaceAllUsesWith(NewPtr); - Info.AI->eraseFromParent(); - Info.AI = NewAI; -} - // FIXME: check for MTE extension bool AArch64StackTagging::runOnFunction(Function &Fn) { if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) @@ -532,76 +495,21 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (MergeInit) AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order - 
SmallVector<Instruction *, 8> RetVec; - SmallVector<Instruction *, 4> UnrecognizedLifetimes; - - for (auto &BB : *F) { - for (Instruction &I : BB) { - if (auto *AI = dyn_cast<AllocaInst>(&I)) { - Allocas[AI].AI = AI; - Allocas[AI].OldAI = AI; - continue; - } - - if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) { - for (Value *V : DVI->location_ops()) - if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) - if (Allocas[AI].DbgVariableIntrinsics.empty() || - Allocas[AI].DbgVariableIntrinsics.back() != DVI) - Allocas[AI].DbgVariableIntrinsics.push_back(DVI); - continue; - } - - auto *II = dyn_cast<IntrinsicInst>(&I); - if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end)) { - AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); - if (!AI) { - UnrecognizedLifetimes.push_back(&I); - continue; - } - if (II->getIntrinsicID() == Intrinsic::lifetime_start) - Allocas[AI].LifetimeStart.push_back(II); - else - Allocas[AI].LifetimeEnd.push_back(II); - } - - if (isa<ReturnInst, ResumeInst, CleanupReturnInst>(&I)) - RetVec.push_back(&I); - } - } + memtag::StackInfoBuilder SIB( + [this](const AllocaInst &AI) { return isInterestingAlloca(AI); }); + for (Instruction &I : instructions(F)) + SIB.visit(I); + memtag::StackInfo &SInfo = SIB.get(); - if (Allocas.empty()) + if (SInfo.AllocasToInstrument.empty()) return false; - int NextTag = 0; - int NumInterestingAllocas = 0; - for (auto &I : Allocas) { - AllocaInfo &Info = I.second; - assert(Info.AI); - - if (!isInterestingAlloca(*Info.AI)) { - Info.Tag = -1; - continue; - } - - alignAndPadAlloca(Info); - NumInterestingAllocas++; - Info.Tag = NextTag; - NextTag = (NextTag + 1) % 16; - } - - if (NumInterestingAllocas == 0) - return true; - std::unique_ptr<DominatorTree> DeleteDT; DominatorTree *DT = nullptr; if (auto *P = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) DT = &P->getDomTree(); - if (DT == nullptr && (NumInterestingAllocas > 1 || - 
!F->hasFnAttribute(Attribute::OptimizeNone))) { + if (DT == nullptr) { DeleteDT = std::make_unique<DominatorTree>(*F); DT = DeleteDT.get(); } @@ -611,38 +519,57 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (auto *P = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>()) PDT = &P->getPostDomTree(); - if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) { + if (PDT == nullptr) { DeletePDT = std::make_unique<PostDominatorTree>(*F); PDT = DeletePDT.get(); } + std::unique_ptr<LoopInfo> DeleteLI; + LoopInfo *LI = nullptr; + if (auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>()) { + LI = &LIWP->getLoopInfo(); + } else { + DeleteLI = std::make_unique<LoopInfo>(*DT); + LI = DeleteLI.get(); + } + SetTagFunc = Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); - Instruction *Base = insertBaseTaggedPointer(Allocas, DT); + Instruction *Base = insertBaseTaggedPointer(SInfo.AllocasToInstrument, DT); - for (auto &I : Allocas) { - const AllocaInfo &Info = I.second; + int NextTag = 0; + for (auto &I : SInfo.AllocasToInstrument) { + memtag::AllocaInfo &Info = I.second; + assert(Info.AI && isInterestingAlloca(*Info.AI)); + TrackingVH<Instruction> OldAI = Info.AI; + memtag::alignAndPadAlloca(Info, kTagGranuleSize); AllocaInst *AI = Info.AI; - if (Info.Tag < 0) - continue; - + int Tag = NextTag; + NextTag = (NextTag + 1) % 16; // Replace alloca with tagp(alloca). 
IRBuilder<> IRB(Info.AI->getNextNode()); Function *TagP = Intrinsic::getDeclaration( F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()}); Instruction *TagPCall = IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base, - ConstantInt::get(IRB.getInt64Ty(), Info.Tag)}); + ConstantInt::get(IRB.getInt64Ty(), Tag)}); if (Info.AI->hasName()) TagPCall->setName(Info.AI->getName() + ".tag"); Info.AI->replaceAllUsesWith(TagPCall); TagPCall->setOperand(0, Info.AI); - if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 && - Info.LifetimeEnd.size() == 1) { + // Calls to functions that may return twice (e.g. setjmp) confuse the + // postdominator analysis, and will leave us to keep memory tagged after + // function return. Work around this by always untagging at every return + // statement if return_twice functions are called. + bool StandardLifetime = + SInfo.UnrecognizedLifetimes.empty() && + memtag::isStandardLifetime(Info.LifetimeStart, Info.LifetimeEnd, DT, LI, + ClMaxLifetimes) && + !SInfo.CallsReturnTwice; + if (StandardLifetime) { IntrinsicInst *Start = Info.LifetimeStart[0]; - IntrinsicInst *End = Info.LifetimeEnd[0]; uint64_t Size = cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); Size = alignTo(Size, kTagGranuleSize); @@ -650,14 +577,16 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { auto TagEnd = [&](Instruction *Node) { untagAlloca(AI, Node, Size); }; if (!DT || !PDT || - !forAllReachableExits(*DT, *PDT, Start, Info.LifetimeEnd, RetVec, - TagEnd)) - End->eraseFromParent(); + !memtag::forAllReachableExits(*DT, *PDT, *LI, Start, Info.LifetimeEnd, + SInfo.RetVec, TagEnd)) { + for (auto *End : Info.LifetimeEnd) + End->eraseFromParent(); + } } else { - uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; + uint64_t Size = *Info.AI->getAllocationSizeInBits(*DL) / 8; Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy()); tagAlloca(AI, &*IRB.GetInsertPoint(), Ptr, Size); - for (auto 
&RI : RetVec) { + for (auto &RI : SInfo.RetVec) { untagAlloca(AI, RI, Size); } // We may have inserted tag/untag outside of any lifetime interval. @@ -670,12 +599,12 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { // Fixup debug intrinsics to point to the new alloca. for (auto DVI : Info.DbgVariableIntrinsics) - DVI->replaceVariableLocationOp(Info.OldAI, Info.AI); + DVI->replaceVariableLocationOp(OldAI, Info.AI); } // If we have instrumented at least one alloca, all unrecognized lifetime - // instrinsics have to go. - for (auto &I : UnrecognizedLifetimes) + // intrinsics have to go. + for (auto &I : SInfo.UnrecognizedLifetimes) I->eraseFromParent(); return true; diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp index cae6d65bed2d..7e91dc1b6385 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -50,7 +50,6 @@ cl::opt<UncheckedLdStMode> ClUncheckedLdSt( static cl::opt<bool> ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true), - cl::ZeroOrMore, cl::desc("Apply first slot optimization for stack tagging " "(eliminate ADDG Rt, Rn, 0, 0).")); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 8a7e20237271..15005304383d 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -21,6 +21,7 @@ #include "GISel/AArch64RegisterBankInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/AArch64TargetParser.h" @@ -51,6 +52,16 @@ static cl::opt<bool> static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen.")); +static cl::opt<unsigned> 
OverrideVectorInsertExtractBaseCost( + "aarch64-insert-extract-base-cost", + cl::desc("Base cost of vector insert/extract element"), cl::Hidden); + +unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { + if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) + return OverrideVectorInsertExtractBaseCost; + return VectorInsertExtractBaseCost; +} + AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies( StringRef FS, StringRef CPUString, StringRef TuneCPUString) { // Determine default and user-specified characteristics @@ -78,14 +89,17 @@ void AArch64Subtarget::initializeProperties() { CacheLineSize = 64; break; case CortexA35: - break; case CortexA53: case CortexA55: PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; break; case CortexA57: MaxInterleaveFactor = 4; PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; break; case CortexA65: PrefFunctionLogAlignment = 3; @@ -93,6 +107,10 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: case CortexA73: case CortexA75: + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; + break; case CortexA76: case CortexA77: case CortexA78: @@ -101,12 +119,21 @@ void AArch64Subtarget::initializeProperties() { case CortexX1: case CortexX1C: PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 5; + MaxBytesForLoopAlignment = 16; break; case CortexA510: + PrefFunctionLogAlignment = 4; + VScaleForTuning = 1; + PrefLoopLogAlignment = 4; + MaxBytesForLoopAlignment = 8; + break; case CortexA710: case CortexX2: PrefFunctionLogAlignment = 4; VScaleForTuning = 1; + PrefLoopLogAlignment = 5; + MaxBytesForLoopAlignment = 16; break; case A64FX: CacheLineSize = 256; @@ -221,6 +248,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. 
MinVectorRegisterBitWidth = 128; break; + case Ampere1: + CacheLineSize = 64; + PrefFunctionLogAlignment = 6; + PrefLoopLogAlignment = 6; + MaxInterleaveFactor = 4; + break; } } @@ -352,6 +385,8 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { if (!UseAddressTopByteIgnored) return false; + if (TargetTriple.isDriverKit()) + return true; if (TargetTriple.isiOS()) { return TargetTriple.getiOSVersion() >= VersionTuple(8); } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 7b2bbad30f85..c92e3e44de31 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -22,7 +22,7 @@ #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" #include <string> @@ -40,6 +40,7 @@ public: enum ARMProcFamilyEnum : uint8_t { Others, A64FX, + Ampere1, AppleA7, AppleA10, AppleA11, @@ -87,191 +88,14 @@ protected: /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. 
ARMProcFamilyEnum ARMProcFamily = Others; - bool HasV8_0aOps = false; - bool HasV8_1aOps = false; - bool HasV8_2aOps = false; - bool HasV8_3aOps = false; - bool HasV8_4aOps = false; - bool HasV8_5aOps = false; - bool HasV8_6aOps = false; - bool HasV8_7aOps = false; - bool HasV8_8aOps = false; - bool HasV9_0aOps = false; - bool HasV9_1aOps = false; - bool HasV9_2aOps = false; - bool HasV9_3aOps = false; - bool HasV8_0rOps = false; - - bool HasCONTEXTIDREL2 = false; - bool HasEL2VMSA = false; - bool HasEL3 = false; - bool HasFPARMv8 = false; - bool HasNEON = false; - bool HasCrypto = false; - bool HasDotProd = false; - bool HasCRC = false; - bool HasLSE = false; - bool HasLSE2 = false; - bool HasRAS = false; - bool HasRDM = false; - bool HasPerfMon = false; - bool HasFullFP16 = false; - bool HasFP16FML = false; - bool HasSPE = false; - - bool FixCortexA53_835769 = false; - - // ARMv8.1 extensions - bool HasVH = false; - bool HasPAN = false; - bool HasLOR = false; - - // ARMv8.2 extensions - bool HasPsUAO = false; - bool HasPAN_RWV = false; - bool HasCCPP = false; - - // SVE extensions - bool HasSVE = false; - bool UseExperimentalZeroingPseudos = false; - bool UseScalarIncVL = false; - - // Armv8.2 Crypto extensions - bool HasSM4 = false; - bool HasSHA3 = false; - bool HasSHA2 = false; - bool HasAES = false; - - // ARMv8.3 extensions - bool HasPAuth = false; - bool HasJS = false; - bool HasCCIDX = false; - bool HasComplxNum = false; - - // ARMv8.4 extensions - bool HasNV = false; - bool HasMPAM = false; - bool HasDIT = false; - bool HasTRACEV8_4 = false; - bool HasAM = false; - bool HasSEL2 = false; - bool HasTLB_RMI = false; - bool HasFlagM = false; - bool HasRCPC_IMMO = false; - - bool HasLSLFast = false; - bool HasRCPC = false; - bool HasAggressiveFMA = false; - - // Armv8.5-A Extensions - bool HasAlternativeNZCV = false; - bool HasFRInt3264 = false; - bool HasSpecRestrict = false; - bool HasSSBS = false; - bool HasSB = false; - bool HasPredRes = false; - bool 
HasCCDP = false; - bool HasBTI = false; - bool HasRandGen = false; - bool HasMTE = false; - bool HasTME = false; - - // Armv8.6-A Extensions - bool HasBF16 = false; - bool HasMatMulInt8 = false; - bool HasMatMulFP32 = false; - bool HasMatMulFP64 = false; - bool HasAMVS = false; - bool HasFineGrainedTraps = false; - bool HasEnhancedCounterVirtualization = false; - - // Armv8.7-A Extensions - bool HasXS = false; - bool HasWFxT = false; - bool HasHCX = false; - bool HasLS64 = false; - - // Armv8.8-A Extensions - bool HasHBC = false; - bool HasMOPS = false; - - // Arm SVE2 extensions - bool HasSVE2 = false; - bool HasSVE2AES = false; - bool HasSVE2SM4 = false; - bool HasSVE2SHA3 = false; - bool HasSVE2BitPerm = false; - - // Armv9-A Extensions - bool HasRME = false; - - // Arm Scalable Matrix Extension (SME) - bool HasSME = false; - bool HasSMEF64 = false; - bool HasSMEI64 = false; - bool HasStreamingSVE = false; - - // AppleA7 system register. - bool HasAppleA7SysReg = false; - - // Future architecture extensions. - bool HasETE = false; - bool HasTRBE = false; - bool HasBRBE = false; - bool HasSPE_EEF = false; - - // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove = false; - - // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing = false; - bool HasZeroCycleZeroingGP = false; - bool HasZeroCycleZeroingFPWorkaround = false; - - // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0". - // as movi is more efficient across all cores. Newer cores can eliminate - // fmovs early and there is no difference with movi, but this not true for - // all implementations. - bool HasZeroCycleZeroingFP = true; - - // StrictAlign - Disallow unaligned memory accesses. - bool StrictAlign = false; - - // NegativeImmediates - transform instructions with negative immediates - bool NegativeImmediates = true; - // Enable 64-bit vectorization in SLP. 
unsigned MinVectorRegisterBitWidth = 64; - bool OutlineAtomics = false; - bool PredictableSelectIsExpensive = false; - bool BalanceFPOps = false; - bool CustomAsCheapAsMove = false; - bool ExynosAsCheapAsMove = false; - bool UsePostRAScheduler = false; - bool Misaligned128StoreIsSlow = false; - bool Paired128IsSlow = false; - bool STRQroIsSlow = false; - bool UseAlternateSExtLoadCVTF32Pattern = false; - bool HasArithmeticBccFusion = false; - bool HasArithmeticCbzFusion = false; - bool HasCmpBccFusion = false; - bool HasFuseAddress = false; - bool HasFuseAES = false; - bool HasFuseArithmeticLogic = false; - bool HasFuseCCSelect = false; - bool HasFuseCryptoEOR = false; - bool HasFuseLiterals = false; - bool DisableLatencySchedHeuristic = false; - bool UseRSqrt = false; - bool Force32BitJumpTables = false; - bool UseEL1ForTP = false; - bool UseEL2ForTP = false; - bool UseEL3ForTP = false; - bool AllowTaggedGlobals = false; - bool HardenSlsRetBr = false; - bool HardenSlsBlr = false; - bool HardenSlsNoComdat = false; +// Bool members corresponding to the SubtargetFeatures defined in tablegen +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "AArch64GenSubtargetInfo.inc" + uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; @@ -282,7 +106,6 @@ protected: unsigned PrefLoopLogAlignment = 0; unsigned MaxBytesForLoopAlignment = 0; unsigned MaxJumpTableSize = 0; - unsigned WideningBaseCost = 0; // ReserveXRegister[i] - X#i is not available as a general purpose register. 
BitVector ReserveXRegister; @@ -331,6 +154,11 @@ public: unsigned MinSVEVectorSizeInBitsOverride = 0, unsigned MaxSVEVectorSizeInBitsOverride = 0); +// Getters for SubtargetFeatures defined in tablegen +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool GETTER() const { return ATTRIBUTE; } +#include "AArch64GenSubtargetInfo.inc" + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -351,9 +179,7 @@ public: const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } - bool enablePostRAScheduler() const override { - return UsePostRAScheduler; - } + bool enablePostRAScheduler() const override { return usePostRAScheduler(); } /// Returns ARM processor family. /// Avoid this function! CPU specifics should be kept local to this class @@ -363,30 +189,6 @@ public: return ARMProcFamily; } - bool hasV8_0aOps() const { return HasV8_0aOps; } - bool hasV8_1aOps() const { return HasV8_1aOps; } - bool hasV8_2aOps() const { return HasV8_2aOps; } - bool hasV8_3aOps() const { return HasV8_3aOps; } - bool hasV8_4aOps() const { return HasV8_4aOps; } - bool hasV8_5aOps() const { return HasV8_5aOps; } - bool hasV9_0aOps() const { return HasV9_0aOps; } - bool hasV9_1aOps() const { return HasV9_1aOps; } - bool hasV9_2aOps() const { return HasV9_2aOps; } - bool hasV9_3aOps() const { return HasV9_3aOps; } - bool hasV8_0rOps() const { return HasV8_0rOps; } - - bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - - bool hasZeroCycleZeroingGP() const { return HasZeroCycleZeroingGP; } - - bool hasZeroCycleZeroingFP() const { return HasZeroCycleZeroingFP; } - - bool hasZeroCycleZeroingFPWorkaround() const { - return HasZeroCycleZeroingFPWorkaround; - } - - bool requiresStrictAlign() const { return StrictAlign; } - bool isXRaySupported() const override { return true; } unsigned 
getMinVectorRegisterBitWidth() const { @@ -399,63 +201,16 @@ public: return CustomCallSavedXRegs[i]; } bool hasCustomCallingConv() const { return CustomCallSavedXRegs.any(); } - bool hasFPARMv8() const { return HasFPARMv8; } - bool hasNEON() const { return HasNEON; } - bool hasCrypto() const { return HasCrypto; } - bool hasDotProd() const { return HasDotProd; } - bool hasCRC() const { return HasCRC; } - bool hasLSE() const { return HasLSE; } - bool hasLSE2() const { return HasLSE2; } - bool hasRAS() const { return HasRAS; } - bool hasRDM() const { return HasRDM; } - bool hasSM4() const { return HasSM4; } - bool hasSHA3() const { return HasSHA3; } - bool hasSHA2() const { return HasSHA2; } - bool hasAES() const { return HasAES; } - bool hasCONTEXTIDREL2() const { return HasCONTEXTIDREL2; } - bool balanceFPOps() const { return BalanceFPOps; } - bool predictableSelectIsExpensive() const { - return PredictableSelectIsExpensive; - } - bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } - bool hasExynosCheapAsMoveHandling() const { return ExynosAsCheapAsMove; } - bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } - bool isPaired128Slow() const { return Paired128IsSlow; } - bool isSTRQroSlow() const { return STRQroIsSlow; } - bool useAlternateSExtLoadCVTF32Pattern() const { - return UseAlternateSExtLoadCVTF32Pattern; - } - bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; } - bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } - bool hasCmpBccFusion() const { return HasCmpBccFusion; } - bool hasFuseAddress() const { return HasFuseAddress; } - bool hasFuseAES() const { return HasFuseAES; } - bool hasFuseArithmeticLogic() const { return HasFuseArithmeticLogic; } - bool hasFuseCCSelect() const { return HasFuseCCSelect; } - bool hasFuseCryptoEOR() const { return HasFuseCryptoEOR; } - bool hasFuseLiterals() const { return HasFuseLiterals; } /// Return true if the CPU supports any kind of 
instruction fusion. bool hasFusion() const { return hasArithmeticBccFusion() || hasArithmeticCbzFusion() || - hasFuseAES() || hasFuseArithmeticLogic() || - hasFuseCCSelect() || hasFuseLiterals(); + hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCCSelect() || + hasFuseAdrpAdd() || hasFuseLiterals(); } - bool hardenSlsRetBr() const { return HardenSlsRetBr; } - bool hardenSlsBlr() const { return HardenSlsBlr; } - bool hardenSlsNoComdat() const { return HardenSlsNoComdat; } - - bool useEL1ForTP() const { return UseEL1ForTP; } - bool useEL2ForTP() const { return UseEL2ForTP; } - bool useEL3ForTP() const { return UseEL3ForTP; } - - bool useRSqrt() const { return UseRSqrt; } - bool force32BitJumpTables() const { return Force32BitJumpTables; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } - unsigned getVectorInsertExtractBaseCost() const { - return VectorInsertExtractBaseCost; - } + unsigned getVectorInsertExtractBaseCost() const; unsigned getCacheLineSize() const override { return CacheLineSize; } unsigned getPrefetchDistance() const override { return PrefetchDistance; } unsigned getMinPrefetchStride(unsigned NumMemAccesses, @@ -478,60 +233,10 @@ public: unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } - unsigned getWideningBaseCost() const { return WideningBaseCost; } - - bool useExperimentalZeroingPseudos() const { - return UseExperimentalZeroingPseudos; - } - - bool useScalarIncVL() const { return UseScalarIncVL; } - /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. 
bool supportsAddressTopByteIgnored() const; - bool hasPerfMon() const { return HasPerfMon; } - bool hasFullFP16() const { return HasFullFP16; } - bool hasFP16FML() const { return HasFP16FML; } - bool hasSPE() const { return HasSPE; } - bool hasLSLFast() const { return HasLSLFast; } - bool hasSVE() const { return HasSVE; } - bool hasSVE2() const { return HasSVE2; } - bool hasRCPC() const { return HasRCPC; } - bool hasAggressiveFMA() const { return HasAggressiveFMA; } - bool hasAlternativeNZCV() const { return HasAlternativeNZCV; } - bool hasFRInt3264() const { return HasFRInt3264; } - bool hasSpecRestrict() const { return HasSpecRestrict; } - bool hasSSBS() const { return HasSSBS; } - bool hasSB() const { return HasSB; } - bool hasPredRes() const { return HasPredRes; } - bool hasCCDP() const { return HasCCDP; } - bool hasBTI() const { return HasBTI; } - bool hasRandGen() const { return HasRandGen; } - bool hasMTE() const { return HasMTE; } - bool hasTME() const { return HasTME; } - // Arm SVE2 extensions - bool hasSVE2AES() const { return HasSVE2AES; } - bool hasSVE2SM4() const { return HasSVE2SM4; } - bool hasSVE2SHA3() const { return HasSVE2SHA3; } - bool hasSVE2BitPerm() const { return HasSVE2BitPerm; } - bool hasMatMulInt8() const { return HasMatMulInt8; } - bool hasMatMulFP32() const { return HasMatMulFP32; } - bool hasMatMulFP64() const { return HasMatMulFP64; } - - // Armv8.6-A Extensions - bool hasBF16() const { return HasBF16; } - bool hasFineGrainedTraps() const { return HasFineGrainedTraps; } - bool hasEnhancedCounterVirtualization() const { - return HasEnhancedCounterVirtualization; - } - - // Arm Scalable Matrix Extension (SME) - bool hasSME() const { return HasSME; } - bool hasSMEF64() const { return HasSMEF64; } - bool hasSMEI64() const { return HasSMEI64; } - bool hasStreamingSVE() const { return HasStreamingSVE; } - bool isLittleEndian() const { return IsLittle; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } @@ -552,42 +257,6 @@ 
public: bool useAA() const override; - bool outlineAtomics() const { return OutlineAtomics; } - - bool hasVH() const { return HasVH; } - bool hasPAN() const { return HasPAN; } - bool hasLOR() const { return HasLOR; } - - bool hasPsUAO() const { return HasPsUAO; } - bool hasPAN_RWV() const { return HasPAN_RWV; } - bool hasCCPP() const { return HasCCPP; } - - bool hasPAuth() const { return HasPAuth; } - bool hasJS() const { return HasJS; } - bool hasCCIDX() const { return HasCCIDX; } - bool hasComplxNum() const { return HasComplxNum; } - - bool hasNV() const { return HasNV; } - bool hasMPAM() const { return HasMPAM; } - bool hasDIT() const { return HasDIT; } - bool hasTRACEV8_4() const { return HasTRACEV8_4; } - bool hasAM() const { return HasAM; } - bool hasAMVS() const { return HasAMVS; } - bool hasXS() const { return HasXS; } - bool hasWFxT() const { return HasWFxT; } - bool hasHCX() const { return HasHCX; } - bool hasLS64() const { return HasLS64; } - bool hasSEL2() const { return HasSEL2; } - bool hasTLB_RMI() const { return HasTLB_RMI; } - bool hasFlagM() const { return HasFlagM; } - bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } - bool hasEL2VMSA() const { return HasEL2VMSA; } - bool hasEL3() const { return HasEL3; } - bool hasHBC() const { return HasHBC; } - bool hasMOPS() const { return HasMOPS; } - - bool fixCortexA53_835769() const { return FixCortexA53_835769; } - bool addrSinkUsingGEPs() const override { // Keeping GEPs inbounds is important for exploiting AArch64 // addressing-modes in ILP32 mode. 
@@ -623,8 +292,6 @@ public: bool enableEarlyIfConversion() const override; - bool enableAdvancedRASplitCost() const override { return false; } - std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; bool isCallingConvWin64(CallingConv::ID CC) const { diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index cce5813fe6e9..f3788175c48d 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -18,23 +18,23 @@ include "llvm/TableGen/SearchableTable.td" //===----------------------------------------------------------------------===// def HasCCPP : Predicate<"Subtarget->hasCCPP()">, - AssemblerPredicate<(all_of FeatureCCPP), "ccpp">; + AssemblerPredicateWithAll<(all_of FeatureCCPP), "ccpp">; def HasPAN : Predicate<"Subtarget->hasPAN()">, - AssemblerPredicate<(all_of FeaturePAN), + AssemblerPredicateWithAll<(all_of FeaturePAN), "ARM v8.1 Privileged Access-Never extension">; def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">, - AssemblerPredicate<(all_of FeaturePsUAO), + AssemblerPredicateWithAll<(all_of FeaturePsUAO), "ARM v8.2 UAO PState extension (psuao)">; def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">, - AssemblerPredicate<(all_of FeaturePAN_RWV), + AssemblerPredicateWithAll<(all_of FeaturePAN_RWV), "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">; def HasCONTEXTIDREL2 : Predicate<"Subtarget->hasCONTEXTIDREL2()">, - AssemblerPredicate<(all_of FeatureCONTEXTIDREL2), + AssemblerPredicateWithAll<(all_of FeatureCONTEXTIDREL2), "Target contains CONTEXTIDR_EL2 RW operand">; //===----------------------------------------------------------------------===// @@ -631,6 +631,7 @@ def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>; def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>; def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>; def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 
0b1100, 0b111>; +def : ROSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>; def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>; @@ -977,7 +978,6 @@ def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>; def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>; def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>; def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>; -def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>; def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>; def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 4af28fc070dd..3f9795f5198b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -12,6 +12,7 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64MachineScheduler.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "AArch64TargetObjectFile.h" @@ -21,7 +22,9 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/CFIFixup.h" #include "llvm/CodeGen/CSEConfigBase.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -31,6 +34,7 @@ #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -59,6 +63,11 @@ static 
cl::opt<bool> cl::desc("Enable the conditional branch tuning pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableAArch64CopyPropagation( + "aarch64-enable-copy-propagation", + cl::desc("Enable the copy propagation with AArch64 copy instr"), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnableMCR("aarch64-enable-mcr", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); @@ -265,7 +274,7 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, // On ELF platforms the default static relocation model has a smart enough // linker to cope with referencing external symbols defined in a shared // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC) + if (!RM || *RM == Reloc::DynamicNoPIC) return Reloc::Static; return *RM; } @@ -354,6 +363,10 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // AArch64 supports the debug entry values. setSupportsDebugEntryValues(true); + + // AArch64 supports fixing up the DWARF unwind information. + if (!getMCAsmInfo()->usesWindowsCFI()) + setCFIFixup(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; @@ -379,7 +392,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { if (VScaleRangeAttr.isValid()) { Optional<unsigned> VScaleMax = VScaleRangeAttr.getVScaleRangeMax(); MinSVEVectorSize = VScaleRangeAttr.getVScaleRangeMin() * 128; - MaxSVEVectorSize = VScaleMax ? VScaleMax.getValue() * 128 : 0; + MaxSVEVectorSize = VScaleMax ? 
*VScaleMax * 128 : 0; } else { MinSVEVectorSize = SVEVectorBitsMinOpt; MaxSVEVectorSize = SVEVectorBitsMaxOpt; @@ -468,15 +481,17 @@ public: ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>(); + ScheduleDAGMI *DAG = + new ScheduleDAGMI(C, std::make_unique<AArch64PostRASchedStrategy>(C), + /* RemoveKillFlags=*/true); if (ST.hasFusion()) { // Run the Macro Fusion after RA again since literals are expanded from // pseudos then (v. addPreSched2()). - ScheduleDAGMI *DAG = createGenericSchedPostRA(C); DAG->addMutation(createAArch64MacroFusionDAGMutation()); return DAG; } - return nullptr; + return DAG; } void addIRPasses() override; @@ -504,7 +519,7 @@ public: } // end anonymous namespace TargetTransformInfo -AArch64TargetMachine::getTargetTransformInfo(const Function &F) { +AArch64TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(AArch64TTIImpl(this, F)); } @@ -531,6 +546,7 @@ void AArch64PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) @@ -574,6 +590,9 @@ void AArch64PassConfig::addIRPasses() { // Add Control Flow Guard checks. 
if (TM->getTargetTriple().isOSWindows()) addPass(createCFGuardCheckPass()); + + if (TM->Options.JMCInstrument) + addPass(createJMCInstrumenterPass()); } // Pass Pipeline Configuration @@ -759,6 +778,10 @@ void AArch64PassConfig::addPreEmitPass() { if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); + if (TM->getOptLevel() >= CodeGenOpt::Aggressive && + EnableAArch64CopyPropagation) + addPass(createMachineCopyPropagationPass(true)); + addPass(createAArch64A53Fix835769()); if (EnableBranchTargets) @@ -804,8 +827,7 @@ AArch64TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { bool AArch64TargetMachine::parseMachineFunctionInfo( const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const { - const auto &YamlMFI = - reinterpret_cast<const yaml::AArch64FunctionInfo &>(MFI); + const auto &YamlMFI = static_cast<const yaml::AArch64FunctionInfo &>(MFI); MachineFunction &MF = PFS.MF; MF.getInfo<AArch64FunctionInfo>()->initializeBaseYamlFields(YamlMFI); return false; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 7d314bce99b1..beb109502ff9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -41,7 +41,7 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile* getObjFileLowering() const override { return TLOF.get(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b2ffdf949d8b..41c7a8c5042f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -8,6 +8,7 @@ #include "AArch64TargetTransformInfo.h" #include "AArch64ExpandImm.h" +#include "AArch64PerfectShuffle.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" @@ -15,8 +16,8 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" @@ -50,6 +51,12 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } +bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( + TargetTransformInfo::RegisterKind K) const { + assert(K != TargetTransformInfo::RGK_Scalar); + return K == TargetTransformInfo::RGK_FixedWidthVector; +} + /// Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. @@ -370,6 +377,49 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return Entry->Cost; break; } + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (ICA.getArgTypes().empty()) + break; + bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; + auto LT = TLI->getTypeLegalizationCost(DL, ICA.getArgTypes()[0]); + EVT MTy = TLI->getValueType(DL, RetTy); + // Check for the legal types, which are where the size of the input and the + // output are the same, or we are using cvt f64->i32 or f32->i64. 
+ if ((LT.second == MVT::f32 || LT.second == MVT::f64 || + LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || + LT.second == MVT::v2f64) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || + (LT.second == MVT::f64 && MTy == MVT::i32) || + (LT.second == MVT::f32 && MTy == MVT::i64))) + return LT.first; + // Similarly for fp16 sizes + if (ST->hasFullFP16() && + ((LT.second == MVT::f16 && MTy == MVT::i32) || + ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && + (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) + return LT.first; + + // Otherwise we use a legal convert followed by a min+max + if ((LT.second.getScalarType() == MVT::f32 || + LT.second.getScalarType() == MVT::f64 || + (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && + LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { + Type *LegalTy = + Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); + if (LT.second.isVector()) + LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); + InstructionCost Cost = 1; + IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(IsSigned ? 
Intrinsic::smax : Intrinsic::umax, + LegalTy, {LegalTy, LegalTy}); + Cost += getIntrinsicInstrCost(Attrs2, CostKind); + return LT.first * Cost; + } + break; + } default: break; } @@ -525,6 +575,14 @@ static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, return IC.replaceInstUsesWith(II, EarliestReplacement); } +static Optional<Instruction *> instCombineSVESel(InstCombiner &IC, + IntrinsicInst &II) { + IRBuilder<> Builder(&II); + auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), + II.getOperand(2)); + return IC.replaceInstUsesWith(II, Select); +} + static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II) { IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); @@ -594,8 +652,7 @@ static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, return None; auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); - if (!VecIns || - VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) + if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) return None; // Where the vector insert is a fixed constant vector insert into undef at @@ -862,12 +919,14 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { if (isAllActivePredicate(Pred)) { LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); + Load->copyMetadata(II); return IC.replaceInstUsesWith(II, Load); } CallInst *MaskedLoad = Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), Pred, ConstantAggregateZero::get(VecTy)); + MaskedLoad->copyMetadata(II); return IC.replaceInstUsesWith(II, MaskedLoad); } @@ -883,12 +942,14 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); if (isAllActivePredicate(Pred)) { - Builder.CreateStore(VecOp, VecPtr); + StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); + Store->copyMetadata(II); return IC.eraseInstFromFunction(II); 
} - Builder.CreateMaskedStore(VecOp, VecPtr, PtrOp->getPointerAlignment(DL), - Pred); + CallInst *MaskedStore = Builder.CreateMaskedStore( + VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); + MaskedStore->copyMetadata(II); return IC.eraseInstFromFunction(II); } @@ -1069,7 +1130,6 @@ static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, Value *BasePtr = II.getOperand(1); Value *Index = II.getOperand(2); Type *Ty = II.getType(); - Type *BasePtrTy = BasePtr->getType(); Value *PassThru = ConstantAggregateZero::get(Ty); // Contiguous gather => masked load. @@ -1085,8 +1145,8 @@ static Optional<Instruction *> instCombineLD1GatherIndex(InstCombiner &IC, BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); Type *VecPtrTy = PointerType::getUnqual(Ty); - Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr, - IndexBase); + Value *Ptr = Builder.CreateGEP( + cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); CallInst *MaskedLoad = Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); @@ -1104,10 +1164,9 @@ static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, Value *BasePtr = II.getOperand(2); Value *Index = II.getOperand(3); Type *Ty = Val->getType(); - Type *BasePtrTy = BasePtr->getType(); // Contiguous scatter => masked store. 
- // (sve.ld1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) + // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) // => (masked.store Value (gep BasePtr IndexBase) Align Mask) Value *IndexBase; if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( @@ -1118,8 +1177,8 @@ static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, Align Alignment = BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); - Value *Ptr = Builder.CreateGEP(BasePtrTy->getPointerElementType(), BasePtr, - IndexBase); + Value *Ptr = Builder.CreateGEP( + cast<VectorType>(Ty)->getElementType(), BasePtr, IndexBase); Type *VecPtrTy = PointerType::getUnqual(Ty); Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); @@ -1165,6 +1224,52 @@ static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, return None; } +static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, + IntrinsicInst &II) { + Value *A = II.getArgOperand(0); + Value *B = II.getArgOperand(1); + if (A == B) + return IC.replaceInstUsesWith(II, A); + + return None; +} + +static Optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, + IntrinsicInst &II) { + IRBuilder<> Builder(&II); + Value *Pred = II.getOperand(0); + Value *Vec = II.getOperand(1); + Value *Shift = II.getOperand(2); + + // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. 
+ Value *AbsPred, *MergedValue; + if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( + m_Value(MergedValue), m_Value(AbsPred), m_Value())) && + !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( + m_Value(MergedValue), m_Value(AbsPred), m_Value()))) + + return None; + + // Transform is valid if any of the following are true: + // * The ABS merge value is an undef or non-negative + // * The ABS predicate is all active + // * The ABS predicate and the SRSHL predicates are the same + if (!isa<UndefValue>(MergedValue) && + !match(MergedValue, m_NonNegative()) && + AbsPred != Pred && !isAllActivePredicate(AbsPred)) + return None; + + // Only valid when the shift amount is non-negative, otherwise the rounding + // behaviour of SRSHL cannot be ignored. + if (!match(Shift, m_NonNegative())) + return None; + + auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, + {Pred, Vec, Shift}); + + return IC.replaceInstUsesWith(II, LSL); +} + Optional<Instruction *> AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -1172,6 +1277,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, switch (IID) { default: break; + case Intrinsic::aarch64_neon_fmaxnm: + case Intrinsic::aarch64_neon_fminnm: + return instCombineMaxMinNM(IC, II); case Intrinsic::aarch64_sve_convert_from_svbool: return instCombineConvertFromSVBool(IC, II); case Intrinsic::aarch64_sve_dup: @@ -1227,6 +1335,10 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVEST1(IC, II, DL); case Intrinsic::aarch64_sve_sdiv: return instCombineSVESDIV(IC, II); + case Intrinsic::aarch64_sve_sel: + return instCombineSVESel(IC, II); + case Intrinsic::aarch64_sve_srshl: + return instCombineSVESrshl(IC, II); } return None; @@ -1262,7 +1374,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, ArrayRef<const Value *> Args) { // A helper that returns a vector type from the given type. 
The number of - // elements in type Ty determine the vector width. + // elements in type Ty determines the vector width. auto toVectorTy = [&](Type *ArgTy) { return VectorType::get(ArgTy->getScalarType(), cast<VectorType>(DstTy)->getElementCount()); @@ -1277,26 +1389,32 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the // instructions. // - // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we + // TODO: Add additional widening operations (e.g., shl, etc.) once we // verify that their extending operands are eliminated during code // generation. switch (Opcode) { case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Mul: // SMULL(2), UMULL(2) break; default: return false; } // To be a widening instruction (either the "wide" or "long" versions), the - // second operand must be a sign- or zero extend having a single user. We - // only consider extends having a single user because they may otherwise not - // be eliminated. + // second operand must be a sign- or zero extend. if (Args.size() != 2 || - (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || - !Args[1]->hasOneUse()) + (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) return false; auto *Extend = cast<CastInst>(Args[1]); + auto *Arg0 = dyn_cast<CastInst>(Args[0]); + + // A mul only has a mull version (not like addw). Both operands need to be + // extending and the same type. + if (Opcode == Instruction::Mul && + (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || + Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) + return false; // Legalize the destination type and ensure it can be used in a widening // operation. 
@@ -1334,7 +1452,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, // If the cast is observable, and it is used by a widening instruction (e.g., // uaddl, saddw, etc.), it may be free. - if (I && I->hasOneUse()) { + if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { @@ -1606,6 +1724,36 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, SrcTy.getSimpleVT())) return AdjustCost(Entry->Cost); + static const TypeConversionCostTblEntry FP16Tbl[] = { + {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs + {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, + {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs + {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, + {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs + {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, + {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn + {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, + {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs + {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, + {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs + {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, + {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn + {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, + {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs + {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, + {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs + {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, + {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf + {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf + {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf + {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf + }; + + if (ST->hasFullFP16()) + if 
(const auto *Entry = ConvertCostTableLookup( + FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return AdjustCost(Entry->Cost); + return AdjustCost( BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } @@ -1723,24 +1871,12 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // Legalize the type. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - - // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), - // add in the widening overhead specified by the sub-target. Since the - // extends feeding widening instructions are performed automatically, they - // aren't present in the generated code and have a zero cost. By adding a - // widening overhead here, we attach the total cost of the combined operation - // to the widening instruction. - InstructionCost Cost = 0; - if (isWideningInstruction(Ty, Opcode, Args)) - Cost += ST->getWideningBaseCost(); - int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { default: - return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, - Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); case ISD::SDIV: if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -1748,26 +1884,22 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. 
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, + InstructionCost Cost = getArithmeticInstrCost( + Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Opd1Info, + Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, - Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, + Cost += getArithmeticInstrCost( + Instruction::Select, Ty, CostKind, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Opd1Info, + Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); return Cost; } LLVM_FALLTHROUGH; - case ISD::UDIV: + case ISD::UDIV: { if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { auto VT = TLI->getValueType(DL, Ty); if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { @@ -1787,9 +1919,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } } - Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, - Opd2Info, - Opd1PropInfo, Opd2PropInfo); + InstructionCost Cost = BaseT::getArithmeticInstrCost( + Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); if (Ty->isVectorTy()) { // On AArch64, vector divisions are not supported natively and are // expanded into scalar divisions of each pair of elements. 
@@ -1804,27 +1935,31 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( Cost += Cost; } return Cost; - + } case ISD::MUL: - if (LT.second != MVT::v2i64) - return (Cost + 1) * LT.first; // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive // as elements are extracted from the vectors and the muls scalarized. // As getScalarizationOverhead is a bit too pessimistic, we estimate the // cost for a i64 vector directly here, which is: - // - four i64 extracts, - // - two i64 inserts, and - // - two muls. - // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with - // LT.first = 2 the cost is 16. - return LT.first * 8; + // - four 2-cost i64 extracts, + // - two 2-cost i64 inserts, and + // - two 1-cost muls. + // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with + // LT.first = 2 the cost is 28. If both operands are extensions it will not + // need to scalarize so the cost can be cheaper (smull or umull). + if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + return LT.first; + return LT.first * 14; case ISD::ADD: case ISD::XOR: case ISD::OR: case ISD::AND: + case ISD::SRL: + case ISD::SRA: + case ISD::SHL: // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering. - return (Cost + 1) * LT.first; + return LT.first; case ISD::FADD: case ISD::FSUB: @@ -1834,11 +1969,10 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // These nodes are marked as 'custom' just to lower them to SVE. // We know said lowering will incur no additional cost. 
if (!Ty->getScalarType()->isFP128Ty()) - return (Cost + 2) * LT.first; + return 2 * LT.first; - return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, - Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); } } @@ -1946,6 +2080,10 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { return Options; } +bool AArch64TTIImpl::prefersVectorizedAddressing() const { + return ST->hasSVE(); +} + InstructionCost AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, @@ -2559,11 +2697,97 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, int Index, - VectorType *SubTp) { + VectorType *SubTp, + ArrayRef<const Value *> Args) { + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + // If we have a Mask, and the LT is being legalized somehow, split the Mask + // into smaller vectors and sum the cost of each shuffle. + if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && + Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && + cast<FixedVectorType>(Tp)->getNumElements() > + LT.second.getVectorNumElements() && + !Index && !SubTp) { + unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements(); + assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); + unsigned LTNumElts = LT.second.getVectorNumElements(); + unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; + VectorType *NTp = + VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); + InstructionCost Cost; + for (unsigned N = 0; N < NumVecs; N++) { + SmallVector<int> NMask; + // Split the existing mask into chunks of size LTNumElts. 
Track the source + // sub-vectors to ensure the result has at most 2 inputs. + unsigned Source1, Source2; + unsigned NumSources = 0; + for (unsigned E = 0; E < LTNumElts; E++) { + int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] + : UndefMaskElem; + if (MaskElt < 0) { + NMask.push_back(UndefMaskElem); + continue; + } + + // Calculate which source from the input this comes from and whether it + // is new to us. + unsigned Source = MaskElt / LTNumElts; + if (NumSources == 0) { + Source1 = Source; + NumSources = 1; + } else if (NumSources == 1 && Source != Source1) { + Source2 = Source; + NumSources = 2; + } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { + NumSources++; + } + + // Add to the new mask. For the NumSources>2 case these are not correct, + // but are only used for the modular lane number. + if (Source == Source1) + NMask.push_back(MaskElt % LTNumElts); + else if (Source == Source2) + NMask.push_back(MaskElt % LTNumElts + LTNumElts); + else + NMask.push_back(MaskElt % LTNumElts); + } + // If the sub-mask has at most 2 input sub-vectors then re-cost it using + // getShuffleCost. If not then cost it using the worst case. + if (NumSources <= 2) + Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc + : TTI::SK_PermuteTwoSrc, + NTp, NMask, 0, nullptr, Args); + else if (any_of(enumerate(NMask), [&](const auto &ME) { + return ME.value() % LTNumElts == ME.index(); + })) + Cost += LTNumElts - 1; + else + Cost += LTNumElts; + } + return Cost; + } + Kind = improveShuffleKindFromMask(Kind, Mask); + + // Check for broadcast loads. + if (Kind == TTI::SK_Broadcast) { + bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); + if (IsLoad && LT.second.isVector() && + isLegalBroadcastLoad(Tp->getElementType(), + LT.second.getVectorElementCount())) + return 0; // broadcast is handled by ld1r + } + + // If we have 4 elements for the shuffle and a Mask, get the cost straight + // from the perfect shuffle tables. 
+ if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && + (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && + all_of(Mask, [](int E) { return E < 8; })) + return getPerfectShuffleCost(Mask); + if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_Reverse) { + static const CostTblEntry ShuffleTbl[] = { // Broadcast shuffle kinds can be performed with 'dup'. { TTI::SK_Broadcast, MVT::v8i8, 1 }, @@ -2618,6 +2842,12 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. + { TTI::SK_Reverse, MVT::v8f16, 2 }, // REV64; EXT + { TTI::SK_Reverse, MVT::v8i16, 2 }, // REV64; EXT + { TTI::SK_Reverse, MVT::v16i8, 2 }, // REV64; EXT + { TTI::SK_Reverse, MVT::v4f16, 1 }, // REV64 + { TTI::SK_Reverse, MVT::v4i16, 1 }, // REV64 + { TTI::SK_Reverse, MVT::v8i8, 1 }, // REV64 // Broadcast shuffle kinds for scalable vectors { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, @@ -2655,11 +2885,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, { TTI::SK_Reverse, MVT::nxv4i1, 1 }, { TTI::SK_Reverse, MVT::nxv2i1, 1 }, }; - std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; } + if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) return getSpliceCost(Tp, Index); + + // Inserting a subvector can often be done with either a D, S or H register + // move, so long as the inserted vector is "aligned". 
+ if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && + LT.second.getSizeInBits() <= 128 && SubTp) { + std::pair<InstructionCost, MVT> SubLT = + TLI->getTypeLegalizationCost(DL, SubTp); + if (SubLT.second.isVector()) { + int NumElts = LT.second.getVectorNumElements(); + int NumSubElts = SubLT.second.getVectorNumElements(); + if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) + return SubLT.first; + } + } + return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index a6029b9f2445..d0aacb457a39 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -135,6 +135,8 @@ public: return ST->getVScaleForTuning(); } + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; + /// Try to return an estimate cost factor that can be used as a multiplier /// when scalarizing an operation for a vector with ElementCount \p VF. /// For scalable vectors this currently takes the most pessimistic view based @@ -148,6 +150,8 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); + bool prefersVectorizedAddressing() const; + InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); @@ -278,6 +282,23 @@ public: return isLegalMaskedGatherScatter(DataType); } + bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const { + // Return true if we can generate a `ld1r` splat load instruction. + if (!ST->hasNEON() || NumElements.isScalable()) + return false; + switch (unsigned ElementBits = ElementTy->getScalarSizeInBits()) { + case 8: + case 16: + case 32: + case 64: { + // We accept bit-widths >= 64bits and elements {8,16,32,64} bits. 
+ unsigned VectorBits = NumElements.getFixedValue() * ElementBits; + return VectorBits >= 64; + } + } + return false; + } + bool isLegalNTStore(Type *DataType, Align Alignment) { // NOTE: The logic below is mostly geared towards LV, which calls it with // vectors with 2 elements. We might want to improve that, if other @@ -330,7 +351,8 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, int Index, - VectorType *SubTp); + VectorType *SubTp, + ArrayRef<const Value *> Args = None); /// @} }; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 33ed7ae9780e..ade23f643538 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -127,7 +127,7 @@ private: return Prefix; } - PrefixInfo() : Active(false), Predicated(false) {} + PrefixInfo() = default; bool isActive() const { return Active; } bool isPredicated() const { return Predicated; } unsigned getElementSize() const { @@ -141,8 +141,8 @@ private: } private: - bool Active; - bool Predicated; + bool Active = false; + bool Predicated = false; unsigned ElementSize; unsigned Dst; unsigned Pg; @@ -157,7 +157,8 @@ private: bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S); - AArch64CC::CondCode parseCondCodeString(StringRef Cond); + AArch64CC::CondCode parseCondCodeString(StringRef Cond, + std::string &Suggestion); bool parseCondCode(OperandVector &Operands, bool invertCondCode); unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind); bool parseRegister(OperandVector &Operands); @@ -189,6 +190,7 @@ private: bool parseDirectiveUnreq(SMLoc L); bool parseDirectiveCFINegateRAState(); bool parseDirectiveCFIBKeyFrame(); + bool parseDirectiveCFIMTETaggedFrame(); bool parseDirectiveVariantPCS(SMLoc L); @@ -2425,7 +2427,7 @@ static 
Optional<std::pair<int, int>> parseVectorKind(StringRef Suffix, } static bool isValidVectorKind(StringRef Suffix, RegKind VectorKind) { - return parseVectorKind(Suffix, VectorKind).hasValue(); + return parseVectorKind(Suffix, VectorKind).has_value(); } static unsigned matchSVEDataVectorRegName(StringRef Name) { @@ -2758,8 +2760,8 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { } auto PRFM = LookupByEncoding(MCE->getValue()); - Operands.push_back(AArch64Operand::CreatePrefetch( - prfop, PRFM.getValueOr(""), S, getContext())); + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, PRFM.value_or(""), + S, getContext())); return MatchOperand_Success; } @@ -3029,8 +3031,10 @@ AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) { return MatchOperand_Success; } -/// parseCondCodeString - Parse a Condition Code string. -AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { +/// parseCondCodeString - Parse a Condition Code string, optionally returning a +/// suggestion to help common typos. 
+AArch64CC::CondCode +AArch64AsmParser::parseCondCodeString(StringRef Cond, std::string &Suggestion) { AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower()) .Case("eq", AArch64CC::EQ) .Case("ne", AArch64CC::NE) @@ -3053,7 +3057,7 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { .Default(AArch64CC::Invalid); if (CC == AArch64CC::Invalid && - getSTI().getFeatureBits()[AArch64::FeatureSVE]) + getSTI().getFeatureBits()[AArch64::FeatureSVE]) { CC = StringSwitch<AArch64CC::CondCode>(Cond.lower()) .Case("none", AArch64CC::EQ) .Case("any", AArch64CC::NE) @@ -3067,6 +3071,9 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { .Case("tstop", AArch64CC::LT) .Default(AArch64CC::Invalid); + if (CC == AArch64CC::Invalid && Cond.lower() == "nfirst") + Suggestion = "nfrst"; + } return CC; } @@ -3078,9 +3085,14 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands, assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); StringRef Cond = Tok.getString(); - AArch64CC::CondCode CC = parseCondCodeString(Cond); - if (CC == AArch64CC::Invalid) - return TokError("invalid condition code"); + std::string Suggestion; + AArch64CC::CondCode CC = parseCondCodeString(Cond, Suggestion); + if (CC == AArch64CC::Invalid) { + std::string Msg = "invalid condition code"; + if (!Suggestion.empty()) + Msg += ", did you mean " + Suggestion + "?"; + return TokError(Msg); + } Lex(); // Eat identifier token. 
if (invertCondCode) { @@ -3910,7 +3922,6 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) { const MCRegisterInfo *RI = getContext().getRegisterInfo(); unsigned PrevReg = FirstReg; - unsigned Count = 1; SmallSet<unsigned, 8> DRegs; AArch64Operand::ComputeRegsForAlias(FirstReg, DRegs, ElementWidth); @@ -3942,7 +3953,6 @@ AArch64AsmParser::tryParseMatrixTileList(OperandVector &Operands) { } PrevReg = Reg; - ++Count; } if (parseToken(AsmToken::RCurly, "'}' expected")) @@ -4545,9 +4555,14 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + (Head.data() - Name.data())); - AArch64CC::CondCode CC = parseCondCodeString(Head); - if (CC == AArch64CC::Invalid) - return Error(SuffixLoc, "invalid condition code"); + std::string Suggestion; + AArch64CC::CondCode CC = parseCondCodeString(Head, Suggestion); + if (CC == AArch64CC::Invalid) { + std::string Msg = "invalid condition code"; + if (!Suggestion.empty()) + Msg += ", did you mean " + Suggestion + "?"; + return Error(SuffixLoc, Msg); + } Operands.push_back(AArch64Operand::CreateToken(".", SuffixLoc, getContext(), /*IsSuffix=*/true)); Operands.push_back( @@ -6024,6 +6039,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveCFINegateRAState(); else if (IDVal == ".cfi_b_key_frame") parseDirectiveCFIBKeyFrame(); + else if (IDVal == ".cfi_mte_tagged_frame") + parseDirectiveCFIMTETaggedFrame(); else if (IDVal == ".arch_extension") parseDirectiveArchExtension(Loc); else if (IDVal == ".variant_pcs") @@ -6198,12 +6215,11 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { if (Extension.Features.none()) report_fatal_error("unsupported architectural extension: " + Name); - FeatureBitset ToggleFeatures = EnableFeature - ? 
(~Features & Extension.Features) - : ( Features & Extension.Features); - FeatureBitset Features = - ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); - setAvailableFeatures(Features); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); break; } } @@ -6217,8 +6233,7 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { StringRef Name = getParser().parseStringToEndOfStatement().trim(); - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.arch_extension' directive")) + if (parseEOL()) return true; bool EnableFeature = true; @@ -6236,12 +6251,11 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) { if (Extension.Features.none()) return Error(ExtLoc, "unsupported architectural extension: " + Name); - FeatureBitset ToggleFeatures = EnableFeature - ? (~Features & Extension.Features) - : (Features & Extension.Features); - FeatureBitset Features = - ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); - setAvailableFeatures(Features); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); return false; } @@ -6281,7 +6295,6 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions); - FeatureBitset Features = STI.getFeatureBits(); for (auto Name : RequestedExtensions) { // Advance source location past '+'. CurLoc = incrementLoc(CurLoc, 1); @@ -6301,12 +6314,12 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { if (Extension.Features.none()) report_fatal_error("unsupported architectural extension: " + Name); - FeatureBitset ToggleFeatures = EnableFeature - ? 
(~Features & Extension.Features) - : ( Features & Extension.Features); - FeatureBitset Features = - ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); - setAvailableFeatures(Features); + FeatureBitset Features = STI.getFeatureBits(); + FeatureBitset ToggleFeatures = + EnableFeature + ? STI.SetFeatureBitsTransitively(~Features & Extension.Features) + : STI.ToggleFeature(Features & Extension.Features); + setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures)); FoundExtension = true; break; @@ -6401,12 +6414,10 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { if (Idx + 1 == NbArgs) break; - if (parseToken(AsmToken::Comma, - "unexpected token in '" + Twine(IDVal) + "' directive")) + if (parseComma()) return true; } - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '" + Twine(IDVal) + "' directive")) + if (parseEOL()) return true; getStreamer().emitLOHDirective((MCLOHType)Kind, Args); @@ -6416,7 +6427,7 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { /// parseDirectiveLtorg /// ::= .ltorg | .pool bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return true; getTargetStreamer().emitCurrentConstantPool(); return false; @@ -6474,8 +6485,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { return Error(SRegLoc, "register name or alias expected"); // Shouldn't be anything else. 
- if (parseToken(AsmToken::EndOfStatement, - "unexpected input in .req directive")) + if (parseEOL()) return true; auto pair = std::make_pair(RegisterKind, (unsigned) RegNum); @@ -6496,7 +6506,7 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { } bool AArch64AsmParser::parseDirectiveCFINegateRAState() { - if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) + if (parseEOL()) return true; getStreamer().emitCFINegateRAState(); return false; @@ -6505,31 +6515,31 @@ bool AArch64AsmParser::parseDirectiveCFINegateRAState() { /// parseDirectiveCFIBKeyFrame /// ::= .cfi_b_key bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() { - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.cfi_b_key_frame'")) + if (parseEOL()) return true; getStreamer().emitCFIBKeyFrame(); return false; } +/// parseDirectiveCFIMTETaggedFrame +/// ::= .cfi_mte_tagged_frame +bool AArch64AsmParser::parseDirectiveCFIMTETaggedFrame() { + if (parseEOL()) + return true; + getStreamer().emitCFIMTETaggedFrame(); + return false; +} + /// parseDirectiveVariantPCS /// ::= .variant_pcs symbolname bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) { - const AsmToken &Tok = getTok(); - if (Tok.isNot(AsmToken::Identifier)) + StringRef Name; + if (getParser().parseIdentifier(Name)) return TokError("expected symbol name"); - - StringRef SymbolName = Tok.getIdentifier(); - - MCSymbol *Sym = getContext().lookupSymbol(SymbolName); - if (!Sym) - return TokError("unknown symbol"); - - Lex(); // Eat the symbol - if (parseEOL()) return true; - getTargetStreamer().emitDirectiveVariantPCS(Sym); + getTargetStreamer().emitDirectiveVariantPCS( + getContext().getOrCreateSymbol(Name)); return false; } @@ -6880,7 +6890,7 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, // as a literal token. 
if (Op.isTokenEqual("za")) return Match_Success; - break; + return Match_InvalidOperand; } if (!Op.isImm()) return Match_InvalidOperand; diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 9ce00f76d9c7..1b65589416c3 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -16,9 +16,10 @@ #include "TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" #include "llvm-c/Disassembler.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" @@ -37,213 +38,226 @@ using DecodeStatus = MCDisassembler::DecodeStatus; // Forward declare these because the autogenerated code will reference them. // Definitions are further down. 
-static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, - unsigned RegNo, +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + 
const MCDisassembler *Decoder); +static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t 
Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template <unsigned NumBitsForTile> static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst, - unsigned RegMask, - uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, - uint64_t 
Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeModImmInstruction(MCInst &Inst, 
uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); +static DecodeStatus +DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, 
const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder); +static DecodeStatus +DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, + const MCDisassembler *Decoder); template <int Bits> static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); template <int ElementWidth> -static 
DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); + uint64_t Addr, + const MCDisassembler *Decoder); static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder); + const MCDisassembler *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { @@ -270,7 +284,8 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) { static MCDisassembler *createAArch64Disassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new AArch64Disassembler(STI, Ctx); + + return new AArch64Disassembler(STI, Ctx, T.createMCInstrInfo()); } DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, @@ -295,67 +310,37 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, DecodeStatus Result = decodeInstruction(Table, MI, Insn, Address, this, STI); - switch (MI.getOpcode()) { - default: - break; + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + // For Scalable Matrix Extension (SME) instructions that have an implicit - // operand for the accumulator (ZA) which isn't encoded, manually insert - // operand. - case AArch64::LDR_ZA: - case AArch64::STR_ZA: { - MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZA)); - // Spill and fill instructions have a single immediate used for both the - // vector select offset and optional memory offset. Replicate the decoded - // immediate. 
+ // operand for the accumulator (ZA) or implicit immediate zero which isn't + // encoded, manually insert operand. + for (unsigned i = 0; i < Desc.getNumOperands(); i++) { + if (Desc.OpInfo[i].OperandType == MCOI::OPERAND_REGISTER) { + switch (Desc.OpInfo[i].RegClass) { + default: + break; + case AArch64::MPRRegClassID: + MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZA)); + break; + case AArch64::MPR8RegClassID: + MI.insert(MI.begin() + i, MCOperand::createReg(AArch64::ZAB0)); + break; + } + } else if (Desc.OpInfo[i].OperandType == + AArch64::OPERAND_IMPLICIT_IMM_0) { + MI.insert(MI.begin() + i, MCOperand::createImm(0)); + } + } + + if (MI.getOpcode() == AArch64::LDR_ZA || + MI.getOpcode() == AArch64::STR_ZA) { + // Spill and fill instructions have a single immediate used for both + // the vector select offset and optional memory offset. Replicate + // the decoded immediate. const MCOperand &Imm4Op = MI.getOperand(2); assert(Imm4Op.isImm() && "Unexpected operand type!"); MI.addOperand(Imm4Op); - break; - } - case AArch64::LD1_MXIPXX_H_B: - case AArch64::LD1_MXIPXX_V_B: - case AArch64::ST1_MXIPXX_H_B: - case AArch64::ST1_MXIPXX_V_B: - case AArch64::INSERT_MXIPZ_H_B: - case AArch64::INSERT_MXIPZ_V_B: - // e.g. - // MOVA ZA0<HV>.B[<Ws>, <imm>], <Pg>/M, <Zn>.B - // ^ insert implicit 8-bit element tile - MI.insert(MI.begin(), MCOperand::createReg(AArch64::ZAB0)); - break; - case AArch64::EXTRACT_ZPMXI_H_B: - case AArch64::EXTRACT_ZPMXI_V_B: - // MOVA <Zd>.B, <Pg>/M, ZA0<HV>.B[<Ws>, <imm>] - // ^ insert implicit 8-bit element tile - MI.insert(MI.begin()+2, MCOperand::createReg(AArch64::ZAB0)); - break; - case AArch64::LD1_MXIPXX_H_Q: - case AArch64::LD1_MXIPXX_V_Q: - case AArch64::ST1_MXIPXX_H_Q: - case AArch64::ST1_MXIPXX_V_Q: - // 128-bit load/store have implicit zero vector index. - MI.insert(MI.begin()+2, MCOperand::createImm(0)); - break; - // 128-bit mova have implicit zero vector index. 
- case AArch64::INSERT_MXIPZ_H_Q: - case AArch64::INSERT_MXIPZ_V_Q: - MI.insert(MI.begin()+2, MCOperand::createImm(0)); - break; - case AArch64::EXTRACT_ZPMXI_H_Q: - case AArch64::EXTRACT_ZPMXI_V_Q: - MI.addOperand(MCOperand::createImm(0)); - break; - case AArch64::SMOVvi8to32_idx0: - case AArch64::SMOVvi8to64_idx0: - case AArch64::SMOVvi16to32_idx0: - case AArch64::SMOVvi16to64_idx0: - case AArch64::SMOVvi32to64_idx0: - case AArch64::UMOVvi8_idx0: - case AArch64::UMOVvi16_idx0: - case AArch64::UMOVvi32_idx0: - case AArch64::UMOVvi64_idx0: - MI.addOperand(MCOperand::createImm(0)); - break; } if (Result != MCDisassembler::Fail) @@ -400,7 +385,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() { static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -410,9 +395,9 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 15) return Fail; return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); @@ -420,7 +405,7 @@ static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -432,7 +417,7 @@ static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -444,7 +429,7 @@ static DecodeStatus DecodeFPR32RegisterClass(MCInst 
&Inst, unsigned RegNo, static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -456,7 +441,7 @@ static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -466,9 +451,9 @@ static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 30) return Fail; @@ -481,7 +466,7 @@ static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -491,10 +476,9 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeGPR64x8ClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, + const MCDisassembler *Decoder) { if (RegNo > 22) return Fail; if (RegNo & 1) @@ -509,7 +493,7 @@ static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -518,10 +502,10 @@ static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static 
DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 3) return Fail; @@ -534,7 +518,7 @@ static DecodeStatus DecodeMatrixIndexGPR32_12_15RegisterClass(MCInst &Inst, static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -546,7 +530,7 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -558,7 +542,7 @@ static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; @@ -570,7 +554,7 @@ static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 15) return Fail; return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder); @@ -578,7 +562,7 @@ static DecodeStatus DecodeZPR_4bRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 7) return Fail; return DecodeZPRRegisterClass(Inst, RegNo, Address, Decoder); @@ -586,7 +570,7 @@ static DecodeStatus DecodeZPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - 
const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -597,7 +581,7 @@ static DecodeStatus DecodeZPR2RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -608,7 +592,7 @@ static DecodeStatus DecodeZPR3RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - const void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -617,10 +601,10 @@ static DecodeStatus DecodeZPR4RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeMatrixTileListRegisterClass(MCInst &Inst, - unsigned RegMask, - uint64_t Address, - const void *Decoder) { +static DecodeStatus +DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, + uint64_t Address, + const MCDisassembler *Decoder) { if (RegMask > 0xFF) return Fail; Inst.addOperand(MCOperand::createImm(RegMask)); @@ -641,7 +625,8 @@ static const SmallVector<SmallVector<unsigned, 16>, 5> template <unsigned NumBitsForTile> static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { unsigned LastReg = (1 << NumBitsForTile) - 1; if (RegNo > LastReg) return Fail; @@ -651,7 +636,8 @@ static DecodeStatus DecodeMatrixTile(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 15) return Fail; @@ -663,7 +649,7 @@ static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const 
void* Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 7) return Fail; @@ -672,7 +658,8 @@ static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -682,7 +669,8 @@ static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -693,7 +681,7 @@ static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -703,7 +691,8 @@ static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -713,7 +702,8 @@ static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -724,7 +714,7 @@ static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { if (RegNo > 31) return Fail; unsigned Register = @@ -735,7 +725,7 @@ static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, static 
DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { // scale{5} is asserted as 1 in tblgen. Imm |= 0x20; Inst.addOperand(MCOperand::createImm(64 - Imm)); @@ -744,29 +734,29 @@ static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(64 - Imm)); return Success; } static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { int64_t ImmVal = Imm; - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); // Sign-extend 19-bit immediate. if (ImmVal & (1 << (19 - 1))) ImmVal |= ~((1LL << 19) - 1); - if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal * 4, Addr, - Inst.getOpcode() != AArch64::LDRXl, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand( + Inst, ImmVal * 4, Addr, Inst.getOpcode() != AArch64::LDRXl, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(ImmVal)); return Success; } static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder) { + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm((Imm >> 1) & 1)); Inst.addOperand(MCOperand::createImm(Imm & 1)); return Success; @@ -774,7 +764,7 @@ static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(Imm)); // Every system register in the encoding space is valid with the syntax @@ -784,7 +774,7 @@ static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned 
Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(Imm)); return Success; @@ -792,7 +782,7 @@ static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { // This decoder exists to add the dummy Lane operand to the MCInst, which must // be 1 in assembly but has no other real manifestation. unsigned Rd = fieldFromInstruction(Insn, 0, 5); @@ -826,66 +816,74 @@ static DecodeStatus DecodeVecShiftLImm(MCInst &Inst, unsigned Imm, } static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 64); } static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); } static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 32); } static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); } static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 16); } static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); } static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t 
Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 8); } static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 64); } static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 32); } static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 16); } static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 8); } -static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rm = fieldFromInstruction(insn, 16, 5); @@ -947,7 +945,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned imm = fieldFromInstruction(insn, 5, 16); unsigned shift = fieldFromInstruction(insn, 21, 2); @@ -978,14 +976,12 @@ static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeUnsignedLdStInstruction(MCInst 
&Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned offset = fieldFromInstruction(insn, 10, 12); - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); switch (Inst.getOpcode()) { default: @@ -1034,14 +1030,14 @@ static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, } DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(offset)); return Success; } static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); int64_t offset = fieldFromInstruction(insn, 12, 9); @@ -1237,9 +1233,9 @@ static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rt2 = fieldFromInstruction(insn, 10, 5); @@ -1322,7 +1318,7 @@ static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rt2 = fieldFromInstruction(insn, 10, 5); @@ -1456,7 +1452,7 @@ static DecodeStatus 
DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); uint64_t offset = fieldFromInstruction(insn, 22, 1) << 9 | @@ -1489,7 +1485,7 @@ static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Rm = fieldFromInstruction(insn, 16, 5); @@ -1546,7 +1542,7 @@ static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Datasize = fieldFromInstruction(insn, 31, 1); @@ -1577,7 +1573,7 @@ static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned cmode = fieldFromInstruction(insn, 12, 4); unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; @@ -1616,7 +1612,7 @@ static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned cmode = fieldFromInstruction(insn, 12, 4); unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; @@ -1633,26 +1629,26 @@ static 
DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, } static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; imm |= fieldFromInstruction(insn, 29, 2); - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); // Sign-extend the 21-bit immediate. if (imm & (1 << (21 - 1))) imm |= ~((1LL << 21) - 1); DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(imm)); return Success; } static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); unsigned Imm = fieldFromInstruction(insn, 10, 14); @@ -1661,8 +1657,6 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, unsigned ShifterVal = (Imm >> 12) & 3; unsigned ImmVal = Imm & 0xFFF; - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); if (ShifterVal != 0 && ShifterVal != 1) return Fail; @@ -1681,7 +1675,7 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); } - if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(ImmVal)); Inst.addOperand(MCOperand::createImm(12 * ShifterVal)); return Success; @@ -1689,24 +1683,22 @@ static DecodeStatus DecodeAddSubImmShift(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn, 
uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { int64_t imm = fieldFromInstruction(insn, 0, 26); - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); // Sign-extend the 26-bit immediate. if (imm & (1 << (26 - 1))) imm |= ~((1LL << 26) - 1); - if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(imm)); return Success; } -static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { uint64_t op1 = fieldFromInstruction(insn, 16, 3); uint64_t op2 = fieldFromInstruction(insn, 5, 3); uint64_t crm = fieldFromInstruction(insn, 8, 4); @@ -1726,22 +1718,20 @@ static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); auto PState = AArch64PState::lookupPStateByEncoding(pstate_field); - if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits())) + if (PState && + PState->haveFeatures(Decoder->getSubtargetInfo().getFeatureBits())) return Success; return Fail; } static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { uint64_t Rt = fieldFromInstruction(insn, 0, 5); uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; bit |= fieldFromInstruction(insn, 19, 5); int64_t dst = fieldFromInstruction(insn, 5, 14); - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); // Sign-extend 14-bit immediate. 
if (dst & (1 << (14 - 1))) @@ -1752,17 +1742,16 @@ static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, else DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); Inst.addOperand(MCOperand::createImm(bit)); - if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4)) + if (!Decoder->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 0, 4)) Inst.addOperand(MCOperand::createImm(dst)); return Success; } -static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, - unsigned RegClassID, - unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegClassID, + unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder) { // Register number must be even (see CASP instruction) if (RegNo & 0x1) return Fail; @@ -1772,27 +1761,25 @@ static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, return Success; } -static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::WSeqPairsClassRegClassID, RegNo, Addr, Decoder); } -static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, + const MCDisassembler *Decoder) { return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::XSeqPairsClassRegClassID, RegNo, Addr, Decoder); } -static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Zdn = fieldFromInstruction(insn, 0, 5); 
unsigned imm = fieldFromInstruction(insn, 5, 13); if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64)) @@ -1808,7 +1795,7 @@ static DecodeStatus DecodeSVELogicalImmInstruction(MCInst &Inst, uint32_t insn, template <int Bits> static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (Imm & ~((1LL << Bits) - 1)) return Fail; @@ -1822,8 +1809,8 @@ static DecodeStatus DecodeSImm(MCInst &Inst, uint64_t Imm, uint64_t Address, // Decode 8-bit signed/unsigned immediate for a given element width. template <int ElementWidth> -static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { +static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { unsigned Val = (uint8_t)Imm; unsigned Shift = (Imm & 0x100) ? 8 : 0; if (ElementWidth == 8 && Shift) @@ -1835,13 +1822,14 @@ static DecodeStatus DecodeImm8OptLsl(MCInst &Inst, unsigned Imm, // Decode uimm4 ranged from 1-16. 
static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createImm(Imm + 1)); return Success; } static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address, - const void *Decoder) { + const MCDisassembler *Decoder) { if (AArch64SVCR::lookupSVCRByEncoding(Imm)) { Inst.addOperand(MCOperand::createImm(Imm)); return Success; @@ -1851,7 +1839,7 @@ static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address, static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rs = fieldFromInstruction(insn, 16, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); @@ -1876,7 +1864,7 @@ static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn, static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rm = fieldFromInstruction(insn, 16, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 374a89edcb74..6761d449a7f4 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -13,13 +13,17 @@ #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H #include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInstrInfo.h" namespace llvm { class AArch64Disassembler : public MCDisassembler { + std::unique_ptr<const MCInstrInfo> const MCII; + public: - AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx) {} + AArch64Disassembler(const 
MCSubtargetInfo &STI, MCContext &Ctx, + MCInstrInfo const *MCII) + : MCDisassembler(STI, Ctx), MCII(MCII) {} ~AArch64Disassembler() override = default; diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 5b6f06f8dbb4..11964b2075e5 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -60,7 +60,7 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) { /// an operand to the MCInst and Fail otherwise. bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, - bool IsBranch, uint64_t Offset, uint64_t InstSize) { + bool IsBranch, uint64_t Offset, uint64_t OpSize, uint64_t InstSize) { if (!SymbolLookUp) return false; // FIXME: This method shares a lot of code with @@ -73,8 +73,8 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( SymbolicOp.Value = Value; uint64_t ReferenceType; const char *ReferenceName; - if (!GetOpInfo || - !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { + if (!GetOpInfo || !GetOpInfo(DisInfo, Address, /*Offset=*/0, OpSize, InstSize, + 1, &SymbolicOp)) { if (IsBranch) { ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h index dc72331660cc..ca677db49739 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -29,7 +29,8 @@ public: bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, bool IsBranch, - uint64_t Offset, uint64_t InstSize) override; + uint64_t Offset, uint64_t OpSize, 
+ uint64_t InstSize) override; }; } // namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 097b93e4fcca..89e1d85a6085 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -18,6 +18,7 @@ #include "AArch64Subtarget.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -1058,10 +1059,10 @@ bool AArch64CallLowering::lowerTailCall( // If Callee is a reg, since it is used by a target specific instruction, // it must have a register class matching the constraint of that instruction. - if (Info.Callee.isReg()) + if (MIB->getOperand(0).isReg()) constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), *MF.getSubtarget().getRegBankInfo(), *MIB, - MIB->getDesc(), Info.Callee, 0); + MIB->getDesc(), MIB->getOperand(0), 0); MF.getFrameInfo().setHasTailCall(); Info.LoweredTailCall = true; @@ -1127,14 +1128,39 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. - unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); + + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + unsigned Opc = 0; + // Calls with operand bundle "clang.arc.attachedcall" are special. They should + // be expanded to the call, directly followed by a special marker sequence and + // a call to an ObjC library function. + if (Info.CB && objcarc::hasAttachedCallOpBundle(Info.CB)) + Opc = AArch64::BLR_RVMARKER; + // A call to a returns twice function like setjmp must be followed by a bti + // instruction. 
+ else if (Info.CB && + Info.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && + !Subtarget.noBTIAtReturnTwice() && + MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) + Opc = AArch64::BLR_BTI; + else + Opc = getCallOpcode(MF, Info.Callee.isReg(), false); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + unsigned CalleeOpNo = 0; + + if (Opc == AArch64::BLR_RVMARKER) { + // Add a target global address for the retainRV/claimRV runtime function + // just before the call target. + Function *ARCFn = *objcarc::getAttachedARCFunction(Info.CB); + MIB.addGlobalAddress(ARCFn); + ++CalleeOpNo; + } + MIB.add(Info.Callee); // Tell the call which registers are clobbered. const uint32_t *Mask; - const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const auto *TRI = Subtarget.getRegisterInfo(); AArch64OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg, @@ -1160,10 +1186,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. - if (Info.Callee.isReg()) + if (MIB->getOperand(CalleeOpNo).isReg()) constrainOperandRegClass(MF, *TRI, MRI, *Subtarget.getInstrInfo(), *Subtarget.getRegBankInfo(), *MIB, MIB->getDesc(), - Info.Callee, 0); + MIB->getOperand(CalleeOpNo), CalleeOpNo); // Finally we can copy the returned value back into its virtual-register. 
In // symmetry with the arguments, the physical register must be an diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 703e356f016d..9a65687735fe 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -21,13 +21,16 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/Optional.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -38,9 +41,9 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" -#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -62,6 +65,7 @@ namespace { #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET + class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, @@ -293,6 +297,20 @@ private: emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; + /// Emit expression as a conjunction (a series 
of CCMP/CFCMP ops). + /// In some cases this is even possible with OR operations in the expression. + MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, + MachineIRBuilder &MIB) const; + MachineInstr *emitConditionalComparison(Register LHS, Register RHS, + CmpInst::Predicate CC, + AArch64CC::CondCode Predicate, + AArch64CC::CondCode OutCC, + MachineIRBuilder &MIB) const; + MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, + bool Negate, Register CCOp, + AArch64CC::CondCode Predicate, + MachineIRBuilder &MIB) const; + /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. /// \p IsNegative is true if the test should be "not zero". /// This will also optimize the test bit instruction when possible. @@ -419,12 +437,16 @@ private: int OpIdx = -1) const; void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; + void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx = -1) const; // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); // Optimization methods. - bool tryOptSelect(MachineInstr &MI); + bool tryOptSelect(GSelect &Sel); + bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; @@ -485,9 +507,11 @@ AArch64InstructionSelector::AArch64InstructionSelector( // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. +// +/// Given a register bank, and a type, return the smallest register class that +/// can represent that combination. 
static const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, - const RegisterBankInfo &RBI, bool GetAllRegSet = false) { if (RB.getID() == AArch64::GPRRegBankID) { if (Ty.getSizeInBits() <= 32) @@ -828,39 +852,6 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, return GenericOpc; } -#ifndef NDEBUG -/// Helper function that verifies that we have a valid copy at the end of -/// selectCopy. Verifies that the source and dest have the expected sizes and -/// then returns true. -static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { - const Register DstReg = I.getOperand(0).getReg(); - const Register SrcReg = I.getOperand(1).getReg(); - const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); - const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - - // Make sure the size of the source and dest line up. - assert( - (DstSize == SrcSize || - // Copies are a mean to setup initial types, the number of - // bits may not exactly match. - (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || - // Copies are a mean to copy bits around, as long as we are - // on the same register class, that's fine. Otherwise, that - // means we need some SUBREG_TO_REG or AND & co. - (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && - "Copy with different width?!"); - - // Check the size of the destination. - assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) && - "GPRs cannot get more than 64-bit width values"); - - return true; -} -#endif - /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg /// to \p *To. /// @@ -935,31 +926,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return false; } - // A couple helpers below, for making sure that the copy we produce is valid. 
- - // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want - // to verify that the src and dst are the same size, since that's handled by - // the SUBREG_TO_REG. - bool KnownValid = false; - - // Returns true, or asserts if something we don't expect happens. Instead of - // returning true, we return isValidCopy() to ensure that we verify the - // result. - auto CheckCopy = [&]() { - // If we have a bitcast or something, we can't have physical registers. - assert((I.isCopy() || - (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && - !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && - "No phys reg on generic operator!"); - bool ValidCopy = true; -#ifndef NDEBUG - ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI); - assert(ValidCopy && "Invalid copy."); -#endif - (void)KnownValid; - return ValidCopy; - }; - // Is this a copy? If so, then we may need to insert a subregister copy. if (I.isCopy()) { // Yes. Check if there's anything to fix up. @@ -1004,15 +970,12 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, .addImm(SubReg); MachineOperand &RegOp = I.getOperand(1); RegOp.setReg(PromoteReg); - - // Promise that the copy is implicitly validated by the SUBREG_TO_REG. - KnownValid = true; } // If the destination is a physical register, then there's nothing to // change, so we're done. if (Register::isPhysicalRegister(DstReg)) - return CheckCopy(); + return true; } // No need to constrain SrcReg. It will get constrained when we hit another @@ -1032,7 +995,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, } I.setDesc(TII.get(AArch64::COPY)); - return CheckCopy(); + return true; } static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { @@ -1309,6 +1272,90 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { } } +/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC. 
+static void changeFPCCToORAArch64CC(CmpInst::Predicate CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case CmpInst::FCMP_OEQ: + CondCode = AArch64CC::EQ; + break; + case CmpInst::FCMP_OGT: + CondCode = AArch64CC::GT; + break; + case CmpInst::FCMP_OGE: + CondCode = AArch64CC::GE; + break; + case CmpInst::FCMP_OLT: + CondCode = AArch64CC::MI; + break; + case CmpInst::FCMP_OLE: + CondCode = AArch64CC::LS; + break; + case CmpInst::FCMP_ONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case CmpInst::FCMP_ORD: + CondCode = AArch64CC::VC; + break; + case CmpInst::FCMP_UNO: + CondCode = AArch64CC::VS; + break; + case CmpInst::FCMP_UEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case CmpInst::FCMP_UGT: + CondCode = AArch64CC::HI; + break; + case CmpInst::FCMP_UGE: + CondCode = AArch64CC::PL; + break; + case CmpInst::FCMP_ULT: + CondCode = AArch64CC::LT; + break; + case CmpInst::FCMP_ULE: + CondCode = AArch64CC::LE; + break; + case CmpInst::FCMP_UNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// Convert an IR fp condition code to an AArch64 CC. +/// This differs from changeFPCCToAArch64CC in that it returns cond codes that +/// should be AND'ed instead of OR'ed. 
+static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + changeFPCCToORAArch64CC(CC, CondCode, CondCode2); + assert(CondCode2 == AArch64CC::AL); + break; + case CmpInst::FCMP_ONE: + // (a one b) + // == ((a olt b) || (a ogt b)) + // == ((a ord b) && (a une b)) + CondCode = AArch64CC::VC; + CondCode2 = AArch64CC::NE; + break; + case CmpInst::FCMP_UEQ: + // (a ueq b) + // == ((a uno b) || (a oeq b)) + // == ((a ule b) && (a uge b)) + CondCode = AArch64CC::PL; + CondCode2 = AArch64CC::LE; + break; + } +} + /// Return a register which can be used as a bit to test in a TB(N)Z. static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, MachineRegisterInfo &MRI) { @@ -1703,7 +1750,6 @@ static Optional<int64_t> getVectorShiftImm(Register Reg, MachineRegisterInfo &MRI) { assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); MachineInstr *OpMI = MRI.getVRegDef(Reg); - assert(OpMI && "Expected to find a vreg def for vector shift operand"); return getAArch64VectorSplatScalar(*OpMI, MRI); } @@ -1810,7 +1856,7 @@ bool AArch64InstructionSelector::selectVectorAshrLshr( unsigned Opc = 0; unsigned NegOpc = 0; const TargetRegisterClass *RC = - getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI); + getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID)); if (Ty == LLT::fixed_vector(2, 64)) { Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; NegOpc = AArch64::NEGv2i64; @@ -2266,6 +2312,16 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { I.eraseFromParent(); return true; } + case TargetOpcode::G_FENCE: { + if (I.getOperand(1).getImm() == 0) + BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CompilerBarrier)) + .addImm(I.getOperand(0).getImm()); + else + BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::DMB)) + .addImm(I.getOperand(0).getImm() == 4 ? 
0x9 : 0xb); + I.eraseFromParent(); + return true; + } default: return false; } @@ -2279,8 +2335,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const AArch64Subtarget *Subtarget = - &static_cast<const AArch64Subtarget &>(MF.getSubtarget()); + const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>(); if (Subtarget->requiresStrictAlign()) { // We don't support this feature yet. LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n"); @@ -2312,7 +2367,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return false; } const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); - DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); + DefRC = getRegClassForTypeOnBank(DefTy, RB); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; @@ -2488,7 +2543,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // The case when we have 0.0 is covered by tablegen. Reject it here so we // can be sure tablegen works correctly and isn't rescued by this code. - // 0.0 is not covered by tablegen for FP128. So we will handle this + // 0.0 is not covered by tablegen for FP128. So we will handle this // scenario in the code here. if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) return false; @@ -2510,7 +2565,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } if (isFP) { - const TargetRegisterClass &FPRRC = *getMinClassForRegBank(RB, DefSize); + const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB); // For 16, 64, and 128b values, emit a constant pool load. 
switch (DefSize) { default: @@ -2735,12 +2790,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return false; if (isa<GLoad>(LdSt)) { - static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH, - AArch64::LDARW, AArch64::LDARX}; + static constexpr unsigned LDAPROpcodes[] = { + AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX}; + static constexpr unsigned LDAROpcodes[] = { + AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX}; + ArrayRef<unsigned> Opcodes = + STI.hasLDAPR() && Order != AtomicOrdering::SequentiallyConsistent + ? LDAPROpcodes + : LDAROpcodes; I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)])); } else { - static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, - AArch64::STLRW, AArch64::STLRX}; + static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH, + AArch64::STLRW, AArch64::STLRX}; Register ValReg = LdSt.getReg(0); if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) { // Emit a subreg copy of 32 bits. 
@@ -2774,7 +2835,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) { unsigned SubReg; LLT MemTy = LdSt.getMMO().getMemoryType(); - auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI); + auto *RC = getRegClassForTypeOnBank(MemTy, RB); if (!getSubRegForClass(RC, TRI, SubReg)) return false; @@ -2790,7 +2851,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { if (RB.getID() == AArch64::FPRRegBankID) { unsigned SubReg; LLT MemTy = LdSt.getMMO().getMemoryType(); - auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI); + auto *RC = getRegClassForTypeOnBank(MemTy, RB); if (!getSubRegForClass(RC, TRI, SubReg)) return false; Register OldDst = LdSt.getReg(0); @@ -2804,7 +2865,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { .addImm(0) .addUse(NewDst) .addImm(SubReg); - auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB, RBI); + auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB); RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI); MIB.setInstr(LdSt); } @@ -2934,8 +2995,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && ShiftTy.getSizeInBits() == 64) { assert(!ShiftTy.isVector() && "unexpected vector shift ty"); - assert(MRI.getVRegDef(ShiftReg) && - "could not find a vreg definition for shift amount"); // Insert a subregister copy to implement a 64->32 trunc auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {}) .addReg(ShiftReg, 0, AArch64::sub_32); @@ -2944,10 +3003,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } } LLVM_FALLTHROUGH; - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FDIV: case TargetOpcode::G_OR: { // Reject the various things we don't support yet. 
if (unsupportedBinOp(I, RBI, MRI, TRI)) @@ -3026,13 +3081,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } if (DstRB.getID() == AArch64::GPRRegBankID) { - const TargetRegisterClass *DstRC = - getRegClassForTypeOnBank(DstTy, DstRB, RBI); + const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); if (!DstRC) return false; - const TargetRegisterClass *SrcRC = - getRegClassForTypeOnBank(SrcTy, SrcRB, RBI); + const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); if (!SrcRC) return false; @@ -3270,6 +3323,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { I.setDesc(TII.get(NewOpc)); constrainSelectedInstRegOperands(I, TII, TRI, RBI); + I.setFlags(MachineInstr::NoFPExcept); return true; } @@ -3291,17 +3345,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_SELECT: { - if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { + auto &Sel = cast<GSelect>(I); + if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) { LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty << ", expected: " << LLT::scalar(1) << '\n'); return false; } - const Register CondReg = I.getOperand(1).getReg(); - const Register TReg = I.getOperand(2).getReg(); - const Register FReg = I.getOperand(3).getReg(); + const Register CondReg = Sel.getCondReg(); + const Register TReg = Sel.getTrueReg(); + const Register FReg = Sel.getFalseReg(); - if (tryOptSelect(I)) + if (tryOptSelect(Sel)) return true; // Make sure to use an unused vreg instead of wzr, so that the peephole @@ -3310,9 +3365,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB)) + if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, 
MIB)) return false; - I.eraseFromParent(); + Sel.eraseFromParent(); return true; } case TargetOpcode::G_ICMP: { @@ -3357,8 +3412,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const Register DstReg = I.getOperand(0).getReg(); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - const TargetRegisterClass *DstRC = - getRegClassForTypeOnBank(DstTy, DstRB, RBI); + const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); RBI.constrainGenericRegister(DstReg, *DstRC, MRI); return true; } @@ -3871,7 +3925,7 @@ bool AArch64InstructionSelector::selectVectorICmp( const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI); const TargetRegisterClass *SrcRC = - getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true); + getRegClassForTypeOnBank(SrcTy, VecRB, true); if (!SrcRC) { LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); return false; @@ -4037,7 +4091,7 @@ MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( } const TargetRegisterClass *DstRC = - getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true); + getRegClassForTypeOnBank(ScalarTy, DstRB, true); if (!DstRC) { LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n"); return nullptr; @@ -4046,7 +4100,7 @@ MachineInstr *AArch64InstructionSelector::emitExtractVectorElt( const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI); const LLT &VecTy = MRI.getType(VecReg); const TargetRegisterClass *VecRC = - getRegClassForTypeOnBank(VecTy, VecRB, RBI, true); + getRegClassForTypeOnBank(VecTy, VecRB, true); if (!VecRC) { LLVM_DEBUG(dbgs() << "Could not determine source register class.\n"); return nullptr; @@ -4205,9 +4259,9 @@ bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I, } else { // No. We have to perform subregister inserts. For each insert, create an // implicit def and a subregister insert, and save the register we create. 
- const TargetRegisterClass *RC = - getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI), - WideTy.getScalarSizeInBits() * NumElts); + const TargetRegisterClass *RC = getRegClassForTypeOnBank( + LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()), + *RBI.getRegBank(SrcReg, MRI, TRI)); unsigned SubReg = 0; bool Found = getSubRegForClass(RC, TRI, SubReg); (void)Found; @@ -4594,6 +4648,7 @@ AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS, // Partially build the compare. Decide if we need to add a use for the // third operand based off whether or not we're comparing against 0.0. auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); + CmpMI.setMIFlags(MachineInstr::NoFPExcept); if (!ShouldUseImm) CmpMI.addUse(RHS); constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); @@ -4632,7 +4687,7 @@ MachineInstr *AArch64InstructionSelector::emitVectorConcat( const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits()); const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI); const TargetRegisterClass *DstRC = - getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2); + getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank); MachineInstr *WidenedOp1 = emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder); @@ -4701,7 +4756,256 @@ AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, } } -bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { +/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be +/// expressed as a conjunction. +/// \param CanNegate Set to true if we can negate the whole sub-tree just by +/// changing the conditions on the CMP tests. +/// (this means we can call emitConjunctionRec() with +/// Negate==true on this sub-tree) +/// \param MustBeFirst Set to true if this subtree needs to be negated and we +/// cannot do the negation naturally. We are required to +/// emit the subtree first in this case. 
+/// \param WillNegate Is true if are called when the result of this +/// subexpression must be negated. This happens when the +/// outer expression is an OR. We can use this fact to know +/// that we have a double negation (or (or ...) ...) that +/// can be implemented for free. +static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, + bool WillNegate, MachineRegisterInfo &MRI, + unsigned Depth = 0) { + if (!MRI.hasOneNonDBGUse(Val)) + return false; + MachineInstr *ValDef = MRI.getVRegDef(Val); + unsigned Opcode = ValDef->getOpcode(); + if (Opcode == TargetOpcode::G_TRUNC) { + // Look through a trunc. + Val = ValDef->getOperand(1).getReg(); + ValDef = MRI.getVRegDef(Val); + Opcode = ValDef->getOpcode(); + } + if (isa<GAnyCmp>(ValDef)) { + CanNegate = true; + MustBeFirst = false; + return true; + } + // Protect against exponential runtime and stack overflow. + if (Depth > 6) + return false; + if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { + bool IsOR = Opcode == TargetOpcode::G_OR; + Register O0 = ValDef->getOperand(1).getReg(); + Register O1 = ValDef->getOperand(2).getReg(); + bool CanNegateL; + bool MustBeFirstL; + if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) + return false; + bool CanNegateR; + bool MustBeFirstR; + if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) + return false; + + if (MustBeFirstL && MustBeFirstR) + return false; + + if (IsOR) { + // For an OR expression we need to be able to naturally negate at least + // one side or we cannot do the transformation at all. + if (!CanNegateL && !CanNegateR) + return false; + // If we the result of the OR will be negated and we can naturally negate + // the leaves, then this sub-tree as a whole negates naturally. + CanNegate = WillNegate && CanNegateL && CanNegateR; + // If we cannot naturally negate the whole sub-tree, then this must be + // emitted first. 
+ MustBeFirst = !CanNegate; + } else { + assert(Opcode == TargetOpcode::G_AND && "Must be G_AND"); + // We cannot naturally negate an AND operation. + CanNegate = false; + MustBeFirst = MustBeFirstL || MustBeFirstR; + } + return true; + } + return false; +} + +MachineInstr *AArch64InstructionSelector::emitConditionalComparison( + Register LHS, Register RHS, CmpInst::Predicate CC, + AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, + MachineIRBuilder &MIB) const { + // TODO: emit CMN as an optimization. + auto &MRI = *MIB.getMRI(); + LLT OpTy = MRI.getType(LHS); + assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); + unsigned CCmpOpc; + if (CmpInst::isIntPredicate(CC)) { + CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr; + } else { + switch (OpTy.getSizeInBits()) { + case 16: + CCmpOpc = AArch64::FCCMPHrr; + break; + case 32: + CCmpOpc = AArch64::FCCMPSrr; + break; + case 64: + CCmpOpc = AArch64::FCCMPDrr; + break; + default: + return nullptr; + } + } + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + auto CCmp = + MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate); + constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); + return &*CCmp; +} + +MachineInstr *AArch64InstructionSelector::emitConjunctionRec( + Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, + AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { + // We're at a tree leaf, produce a conditional comparison operation. + auto &MRI = *MIB.getMRI(); + MachineInstr *ValDef = MRI.getVRegDef(Val); + unsigned Opcode = ValDef->getOpcode(); + if (Opcode == TargetOpcode::G_TRUNC) { + // Look through a trunc. 
+ Val = ValDef->getOperand(1).getReg(); + ValDef = MRI.getVRegDef(Val); + Opcode = ValDef->getOpcode(); + } + if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) { + Register LHS = Cmp->getLHSReg(); + Register RHS = Cmp->getRHSReg(); + CmpInst::Predicate CC = Cmp->getCond(); + if (Negate) + CC = CmpInst::getInversePredicate(CC); + if (isa<GICmp>(Cmp)) { + OutCC = changeICMPPredToAArch64CC(CC); + } else { + // Handle special FP cases. + AArch64CC::CondCode ExtraCC; + changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); + // Some floating point conditions can't be tested with a single condition + // code. Construct an additional comparison in this case. + if (ExtraCC != AArch64CC::AL) { + MachineInstr *ExtraCmp; + if (!CCOp) + ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); + else + ExtraCmp = + emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); + CCOp = ExtraCmp->getOperand(0).getReg(); + Predicate = ExtraCC; + } + } + + // Produce a normal comparison if we are first in the chain + if (!CCOp) { + auto Dst = MRI.cloneVirtualRegister(LHS); + if (isa<GICmp>(Cmp)) + return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); + return emitFPCompare(Cmp->getOperand(2).getReg(), + Cmp->getOperand(3).getReg(), MIB); + } + // Otherwise produce a ccmp. 
+ return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); + } + assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); + + bool IsOR = Opcode == TargetOpcode::G_OR; + + Register LHS = ValDef->getOperand(1).getReg(); + bool CanNegateL; + bool MustBeFirstL; + bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); + assert(ValidL && "Valid conjunction/disjunction tree"); + (void)ValidL; + + Register RHS = ValDef->getOperand(2).getReg(); + bool CanNegateR; + bool MustBeFirstR; + bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); + assert(ValidR && "Valid conjunction/disjunction tree"); + (void)ValidR; + + // Swap sub-tree that must come first to the right side. + if (MustBeFirstL) { + assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); + std::swap(LHS, RHS); + std::swap(CanNegateL, CanNegateR); + std::swap(MustBeFirstL, MustBeFirstR); + } + + bool NegateR; + bool NegateAfterR; + bool NegateL; + bool NegateAfterAll; + if (Opcode == TargetOpcode::G_OR) { + // Swap the sub-tree that we can negate naturally to the left. + if (!CanNegateL) { + assert(CanNegateR && "at least one side must be negatable"); + assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); + assert(!Negate); + std::swap(LHS, RHS); + NegateR = false; + NegateAfterR = true; + } else { + // Negate the left sub-tree if possible, otherwise negate the result. + NegateR = CanNegateR; + NegateAfterR = !CanNegateR; + } + NegateL = true; + NegateAfterAll = !Negate; + } else { + assert(Opcode == TargetOpcode::G_AND && + "Valid conjunction/disjunction tree"); + assert(!Negate && "Valid conjunction/disjunction tree"); + + NegateL = false; + NegateR = false; + NegateAfterR = false; + NegateAfterAll = false; + } + + // Emit sub-trees. 
+ AArch64CC::CondCode RHSCC; + MachineInstr *CmpR = + emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); + if (NegateAfterR) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + MachineInstr *CmpL = emitConjunctionRec( + LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); + if (NegateAfterAll) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +MachineInstr *AArch64InstructionSelector::emitConjunction( + Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { + bool DummyCanNegate; + bool DummyMustBeFirst; + if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, + *MIB.getMRI())) + return nullptr; + return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); +} + +bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, + MachineInstr &CondMI) { + AArch64CC::CondCode AArch64CC; + MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); + if (!ConjMI) + return false; + + emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB); + SelI.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { MachineRegisterInfo &MRI = *MIB.getMRI(); // We want to recognize this pattern: // @@ -4750,12 +5054,12 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { } // Is the condition defined by a compare? - if (!CondDef) - return false; - unsigned CondOpc = CondDef->getOpcode(); - if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) + if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { + if (tryOptSelectConjunction(I, *CondDef)) + return true; return false; + } AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { @@ -5081,7 +5385,7 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I, // the original size to get the result we want. 
Register DemoteVec = InsMI->getOperand(0).getReg(); const TargetRegisterClass *RC = - getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize); + getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI)); if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { LLVM_DEBUG(dbgs() << "Unsupported register class!\n"); return false; @@ -5198,12 +5502,11 @@ bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg( })) return false; unsigned SubReg; - const TargetRegisterClass *EltRC = - getMinClassForRegBank(EltRB, EltTy.getSizeInBits()); + const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB); if (!EltRC) return false; const TargetRegisterClass *DstRC = - getMinClassForRegBank(DstRB, MRI.getType(Dst).getSizeInBits()); + getRegClassForTypeOnBank(MRI.getType(Dst), DstRB); if (!DstRC) return false; if (!getSubRegForClass(EltRC, TRI, SubReg)) @@ -5261,7 +5564,7 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I, if (DstSize < 128) { // Force this to be FPR using the destination vector. 
const TargetRegisterClass *RC = - getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize); + getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI)); if (!RC) return false; if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) { @@ -5528,7 +5831,7 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, uint64_t Key = I.getOperand(3).getImm(); Register DiscReg = I.getOperand(4).getReg(); auto DiscVal = getIConstantVRegVal(DiscReg, MRI); - bool IsDiscZero = DiscVal.hasValue() && DiscVal->isNullValue(); + bool IsDiscZero = DiscVal && DiscVal->isNullValue(); if (Key > 3) return false; @@ -5777,8 +6080,6 @@ AArch64InstructionSelector::selectExtendedSHL( MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); - if (!OffsetInst) - return None; unsigned OffsetOpc = OffsetInst->getOpcode(); bool LookedThroughZExt = false; @@ -5932,7 +6233,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset( // We need a GEP. MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); - if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD) + if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD) return None; // If this is used more than once, let's not bother folding. 
@@ -6112,14 +6413,12 @@ AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, return None; MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); - if (!RootDef) - return None; MachineOperand &OffImm = RootDef->getOperand(2); if (!OffImm.isReg()) return None; MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); - if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) + if (RHS->getOpcode() != TargetOpcode::G_CONSTANT) return None; int64_t RHSC; MachineOperand &RHSOp1 = RHS->getOperand(1); @@ -6187,9 +6486,6 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, return None; MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); - if (!RootDef) - return None; - if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { return {{ [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, @@ -6210,27 +6506,26 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, MachineOperand &RHS = RootDef->getOperand(2); MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); - if (LHSDef && RHSDef) { - int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); - unsigned Scale = Log2_32(Size); - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { - if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, - }}; + int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, }}; - } + + return {{ + [=](MachineInstrBuilder 
&MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, + }}; } } // Before falling back to our general case, check if the unscaled // instructions can handle this. If so, that's preferable. - if (selectAddrModeUnscaled(Root, Size).hasValue()) + if (selectAddrModeUnscaled(Root, Size)) return None; return {{ @@ -6269,8 +6564,6 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root, // Check if the operand is defined by an instruction which corresponds to // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); - if (!ShiftInst) - return None; AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); if (ShType == AArch64_AM::InvalidShiftExtend) return None; @@ -6425,7 +6718,7 @@ AArch64InstructionSelector::selectArithExtendedRegister( // to. if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); - if (ExtInst && isDef32(*ExtInst)) + if (isDef32(*ExtInst)) return None; } } @@ -6450,7 +6743,7 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, Optional<int64_t> CstVal = getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI); assert(CstVal && "Expected constant value"); - MIB.addImm(CstVal.getValue()); + MIB.addImm(*CstVal); } void AArch64InstructionSelector::renderLogicalImm32( @@ -6498,6 +6791,17 @@ void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB, AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF())); } +void AArch64InstructionSelector::renderFPImm32SIMDModImmType4( + MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { + assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 && + "Expected G_FCONSTANT"); + MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1) + .getFPImm() + ->getValueAPF() + .bitcastToAPInt() + .getZExtValue())); +} + bool 
AArch64InstructionSelector::isLoadStoreOfNumBytes( const MachineInstr &MI, unsigned NumBytes) const { if (!MI.mayLoadOrStore()) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index e9df7e001d38..74ec9373ce9e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -169,7 +169,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .scalarize(0); getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) - .lowerFor({s1, s8, s16, s32, s64, v2s64, v4s32, v2s32}) + .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32}) .widenScalarOrEltToNextPow2(0) .clampScalarOrElt(0, s32, s64) .clampNumElements(0, v2s32, v4s32) @@ -180,7 +180,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SMULO, G_UMULO}) .widenScalarToNextPow2(0, /*Min = */ 32) .clampScalar(0, s32, s64) - .lowerIf(typeIs(1, s1)); + .lower(); getActionDefinitionsBuilder({G_SMULH, G_UMULH}) .legalFor({s64, v8s16, v16s8, v4s32}) @@ -308,7 +308,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // These extends are also legal .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}}) .widenScalarToNextPow2(0, /* MinSize = */8) - .lowerIfMemSizeNotPow2() + .lowerIfMemSizeNotByteSizePow2() .clampScalar(0, s8, s64) .narrowScalarIf([=](const LegalityQuery &Query) { // Clamp extending load results to 32-bits. 
@@ -317,10 +317,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) Query.Types[0].getSizeInBits() > 32; }, changeTo(0, s32)) - // Lower any any-extending loads left into G_ANYEXT and G_LOAD - .lowerIf([=](const LegalityQuery &Query) { - return Query.Types[0] != Query.MMODescrs[0].MemoryTy; - }) .clampMaxNumElements(0, s8, 16) .clampMaxNumElements(0, s16, 8) .clampMaxNumElements(0, s32, 4) @@ -536,7 +532,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) .lowerIf( - all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1), typeIs(2, p0))); + all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) .customIf([](const LegalityQuery &Query) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index 3dec980a819a..ba206bac68d1 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -20,11 +20,13 @@ //===----------------------------------------------------------------------===// #include "AArch64TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" @@ -133,7 +135,7 @@ bool matchAArch64MulConstCombine( if (!Const) return false; - const APInt ConstValue = Const->Value.sextOrSelf(Ty.getSizeInBits()); + APInt ConstValue = Const->Value.sext(Ty.getSizeInBits()); // The 
following code is ported from AArch64ISelLowering. // Multiplication of a power of two plus/minus one can be done more // cheaply as as shift+add/sub. For now, this is true unilaterally. If @@ -258,7 +260,7 @@ void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI, // %d(s64) = G_ZEXT %a(s32) Observer.changingInstr(MI); MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT)); - MI.RemoveOperand(2); + MI.removeOperand(2); Observer.changedInstr(MI); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 3ff67d188822..d7959a82c484 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -58,7 +58,7 @@ struct ShuffleVectorPseudo { ShuffleVectorPseudo(unsigned Opc, Register Dst, std::initializer_list<SrcOp> SrcOps) : Opc(Opc), Dst(Dst), SrcOps(SrcOps){}; - ShuffleVectorPseudo() {} + ShuffleVectorPseudo() = default; }; /// Check if a vector shuffle corresponds to a REV instruction with the diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp index cc45c6642ac5..ce6f15a799b7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp @@ -149,7 +149,7 @@ bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) { "op in fcmp range: " << II); II.setDesc(TII->get(NewOpc)); - II.RemoveOperand(DeadNZCVIdx); + II.removeOperand(DeadNZCVIdx); // Changing the opcode can result in differing regclass requirements, // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp. // Constrain the regclasses, possibly introducing a copy. 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index d3f4130d2ba1..275949c5ee64 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -13,6 +13,7 @@ #include "AArch64GlobalISelUtils.h" #include "AArch64TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -162,13 +163,14 @@ static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, // Check whether folding this offset is legal. It must not go out of bounds of // the referenced object to avoid violating the code model, and must be - // smaller than 2^21 because this is the largest offset expressible in all - // object formats. + // smaller than 2^20 because this is the largest offset expressible in all + // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF + // stores an immediate signed 21 bit offset.) // // This check also prevents us from folding negative offsets, which will end // up being treated in the same way as large positive ones. They could also // cause code model violations, and aren't really common enough to matter. 
- if (NewOffset >= (1 << 21)) + if (NewOffset >= (1 << 20)) return false; Type *T = GV->getValueType(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 515a5c63a559..f0b311289c41 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -12,20 +12,19 @@ //===----------------------------------------------------------------------===// #include "AArch64RegisterBankInfo.h" -#include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterBank.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -42,8 +41,8 @@ using namespace llvm; -AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) - : AArch64GenRegisterBankInfo() { +AArch64RegisterBankInfo::AArch64RegisterBankInfo( + const TargetRegisterInfo &TRI) { static llvm::once_flag InitializeRegisterBankFlag; static auto InitializeRegisterBankOnce = [&]() { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h index 2d76e48d7df2..01ef0bd92d50 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h @@ -13,7 +13,7 @@ 
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "AArch64GenRegisterBank.inc" diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index dbb8e85713cb..e4b547e17f64 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -22,10 +22,10 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -470,7 +470,7 @@ bool AArch64AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, // We are properly aligned, so write NOPs as requested. 
Count /= 4; for (uint64_t i = 0; i != Count; ++i) - support::endian::write<uint32_t>(OS, 0xd503201f, Endian); + OS.write("\x1f\x20\x03\xd5", 4); return true; } @@ -592,17 +592,18 @@ public: if (XReg != AArch64::FP) return CU::UNWIND_ARM64_MODE_DWARF; - assert(XReg == AArch64::FP && "Invalid frame pointer!"); - assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); + if (i + 2 >= e) + return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &LRPush = Instrs[++i]; - assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && - "Link register not pushed!"); + if (LRPush.getOperation() != MCCFIInstruction::OpOffset) + return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &FPPush = Instrs[++i]; - assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && - "Frame pointer not pushed!"); + if (FPPush.getOperation() != MCCFIInstruction::OpOffset) + return CU::UNWIND_ARM64_MODE_DWARF; - assert(FPPush.getOffset() + 8 == LRPush.getOffset()); + if (FPPush.getOffset() + 8 != LRPush.getOffset()) + return CU::UNWIND_ARM64_MODE_DWARF; CurOffset = FPPush.getOffset(); unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true); @@ -611,8 +612,8 @@ public: LRReg = getXRegFromWReg(LRReg); FPReg = getXRegFromWReg(FPReg); - assert(LRReg == AArch64::LR && FPReg == AArch64::FP && - "Pushing invalid registers for frame!"); + if (LRReg != AArch64::LR || FPReg != AArch64::FP) + return CU::UNWIND_ARM64_MODE_DWARF; // Indicate that the function has a frame. 
CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; @@ -620,7 +621,8 @@ public: break; } case MCCFIInstruction::OpDefCfaOffset: { - assert(StackSize == 0 && "We already have the CFA offset!"); + if (StackSize != 0) + return CU::UNWIND_ARM64_MODE_DWARF; StackSize = std::abs(Inst.getOffset()); break; } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 78c0e90b1384..46edb12959d2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -254,6 +254,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { } void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) { + getStreamer().getAssembler().registerSymbol(*Symbol); cast<MCSymbolELF>(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index ee0870d9ef7a..5d2ba7ef02c0 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1340,11 +1340,6 @@ void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI, O << getRegisterName(Even) << ", " << getRegisterName(Odd); } -static const unsigned MatrixZADRegisterTable[] = { - AArch64::ZAD0, AArch64::ZAD1, AArch64::ZAD2, AArch64::ZAD3, - AArch64::ZAD4, AArch64::ZAD5, AArch64::ZAD6, AArch64::ZAD7 -}; - void AArch64InstPrinter::printMatrixTileList(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1362,7 +1357,7 @@ void AArch64InstPrinter::printMatrixTileList(const MCInst *MI, unsigned OpNum, unsigned Reg = RegMask & (1 << I); if (Reg == 0) continue; - O << getRegisterName(MatrixZADRegisterTable[I]); + O << getRegisterName(AArch64::ZAD0 + I); if (Printed + 1 != NumRegs) O << ", "; ++Printed; diff --git 
a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index ad97071434df..2901e5c0fe4d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -16,6 +16,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" @@ -677,7 +678,6 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( #include "AArch64GenMCCodeEmitter.inc" MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { return new AArch64MCCodeEmitter(MCII, Ctx); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 844bd6bbada9..cb39c2a11487 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index c1186ae804d2..34e3b2cf58e4 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -52,21 +52,14 @@ static MCSubtargetInfo * createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { if (CPU.empty()) { CPU = "generic"; + if (FS.empty()) + FS = "+v8a"; if (TT.isArm64e()) CPU = "apple-a12"; } - // Most of the NEON instruction set isn't supported in streaming mode on SME - // targets, disable NEON unless explicitly 
requested. - bool RequestedNEON = FS.contains("neon"); - bool RequestedStreamingSVE = FS.contains("streaming-sve"); - MCSubtargetInfo *STI = - createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); - if (RequestedStreamingSVE && !RequestedNEON && - STI->hasFeature(AArch64::FeatureNEON)) - STI->ToggleFeature(AArch64::FeatureNEON); - return STI; + return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); } void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { @@ -243,6 +236,31 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); } +bool AArch64_MC::isQForm(const MCInst &MI, const MCInstrInfo *MCII) { + const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; + return llvm::any_of(MI, [&](const MCOperand &Op) { + return Op.isReg() && FPR128.contains(Op.getReg()); + }); +} + +bool AArch64_MC::isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII) { + const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; + const auto &FPR64 = AArch64MCRegisterClasses[AArch64::FPR64RegClassID]; + const auto &FPR32 = AArch64MCRegisterClasses[AArch64::FPR32RegClassID]; + const auto &FPR16 = AArch64MCRegisterClasses[AArch64::FPR16RegClassID]; + const auto &FPR8 = AArch64MCRegisterClasses[AArch64::FPR8RegClassID]; + + auto IsFPR = [&](const MCOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + return FPR128.contains(Reg) || FPR64.contains(Reg) || FPR32.contains(Reg) || + FPR16.contains(Reg) || FPR8.contains(Reg); + }; + + return llvm::any_of(MI, IsFPR); +} + static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) { MCRegisterInfo *X = new MCRegisterInfo(); InitAArch64MCRegisterInfo(X, AArch64::LR); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 66cb7a37a958..049c49796dc6 100644 --- 
a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" #include <memory> @@ -22,6 +23,7 @@ class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; +class MCInst; class MCInstrInfo; class MCInstPrinter; class MCRegisterInfo; @@ -33,7 +35,6 @@ class MCTargetStreamer; class Target; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createAArch64leAsmBackend(const Target &T, const MCSubtargetInfo &STI, @@ -60,8 +61,16 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, namespace AArch64_MC { void initLLVMToCVRegMapping(MCRegisterInfo *MRI); +bool isQForm(const MCInst &MI, const MCInstrInfo *MCII); +bool isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII); } +namespace AArch64 { +enum OperandType { + OPERAND_IMPLICIT_IMM_0 = MCOI::OPERAND_FIRST_TARGET, +}; +} // namespace AArch64 + } // End llvm namespace // Defines symbolic names for AArch64 registers. This defines a mapping from diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 92552c3d41d5..1a8071ac1b33 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -76,7 +76,7 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { return; } MCSection *Cur = OutStreamer.getCurrentSectionOnly(); - OutStreamer.SwitchSection(Nt); + OutStreamer.switchSection(Nt); // Emit the note header. 
OutStreamer.emitValueToAlignment(Align(8).value()); @@ -92,7 +92,7 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { OutStreamer.emitIntValue(0, 4); // pad OutStreamer.endSection(Nt); - OutStreamer.SwitchSection(Cur); + OutStreamer.switchSection(Cur); } void AArch64TargetStreamer::emitInst(uint32_t Inst) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 0072af4cc16e..46ffa50b3e6e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index b688165d3a7b..820d940c1ed2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -8,6 +8,7 @@ #include "AArch64WinCOFFStreamer.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCWin64EH.h" @@ -26,14 +27,14 @@ public: std::unique_ptr<MCObjectWriter> OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitWinEHHandlerData(SMLoc Loc) override; - void EmitWindowsUnwindTables() override; - void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; + void emitWinEHHandlerData(SMLoc Loc) override; + void emitWindowsUnwindTables() override; + void emitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; void finishImpl() override; }; -void 
AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { - MCStreamer::EmitWinEHHandlerData(Loc); +void AArch64WinCOFFStreamer::emitWinEHHandlerData(SMLoc Loc) { + MCStreamer::emitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! @@ -41,11 +42,11 @@ void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { /* HandlerData = */ true); } -void AArch64WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { +void AArch64WinCOFFStreamer::emitWindowsUnwindTables(WinEH::FrameInfo *Frame) { EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); } -void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() { +void AArch64WinCOFFStreamer::emitWindowsUnwindTables() { if (!getNumWinFrameInfos()) return; EHStreamer.Emit(*this); @@ -53,7 +54,7 @@ void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() { void AArch64WinCOFFStreamer::finishImpl() { emitFrames(nullptr); - EmitWindowsUnwindTables(); + emitWindowsUnwindTables(); MCWinCOFFStreamer::finishImpl(); } @@ -71,10 +72,9 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinUnwindCode(unsigned UnwindCode, WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc()); if (!CurFrame) return; - MCSymbol *Label = S.emitCFILabel(); - auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset); + auto Inst = WinEH::Instruction(UnwindCode, /*Label=*/nullptr, Reg, Offset); if (InEpilogCFI) - CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); + CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); else CurFrame->Instructions.push_back(Inst); } @@ -176,7 +176,8 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIPrologEnd() { MCSymbol *Label = S.emitCFILabel(); CurFrame->PrologEnd = Label; - WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); + WinEH::Instruction Inst = + WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0); auto it = CurFrame->Instructions.begin(); 
CurFrame->Instructions.insert(it, Inst); } @@ -198,9 +199,9 @@ void AArch64TargetWinCOFFStreamer::emitARM64WinCFIEpilogEnd() { return; InEpilogCFI = false; - MCSymbol *Label = S.emitCFILabel(); - WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0); - CurFrame->EpilogMap[CurrentEpilog].push_back(Inst); + WinEH::Instruction Inst = + WinEH::Instruction(Win64EH::UOP_End, /*Label=*/nullptr, -1, 0); + CurFrame->EpilogMap[CurrentEpilog].Instructions.push_back(Inst); CurrentEpilog = nullptr; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 41f2cead4cf8..2744e81f99f1 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -10,14 +10,36 @@ // //===----------------------------------------------------------------------===// +def imm_to_tile8 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAB0>", []>; +def imm_to_tile16 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAH0>", []>; +def imm_to_tile32 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAS0>", []>; +def imm_to_tile64 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAD0>", []>; +def imm_to_tile128 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAQ0>", []>; + +def tileslice8 : ComplexPattern<i32 , 2, "SelectSMETileSlice<4>", []>; +def tileslice16 : ComplexPattern<i32 , 2, "SelectSMETileSlice<3>", []>; +def tileslice32 : ComplexPattern<i32 , 2, "SelectSMETileSlice<2>", []>; +def tileslice64 : ComplexPattern<i32 , 2, "SelectSMETileSlice<1>", []>; +def tileslice128 : ComplexPattern<i32 , 2, "SelectSMETileSlice<0>", []>; // nop + +def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>", [], [SDNPWantRoot]>; + //===----------------------------------------------------------------------===// // SME Outer Products //===----------------------------------------------------------------------===// +class sme_outer_product_pseudo<ZPRRegOp zpr_ty> + : Pseudo<(outs), (ins i64imm:$tile, 
PPR3bAny:$pn, PPR3bAny:$pm, + zpr_ty:$zn, zpr_ty:$zm), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; +} + class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty, ZPRRegOp zpr_ty, string mnemonic> : I<(outs za_ty:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), + (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", "", []>, Sched<[]> { @@ -34,26 +56,42 @@ class sme_fp_outer_product_inst<bit S, bit sz, MatrixTileOperand za_ty, let Inst{9-5} = Zn; let Inst{4} = S; let Inst{3} = 0b0; + + let Constraints = "$ZAda = $_ZAda"; } -class sme_outer_product_fp32<bit S, string mnemonic> - : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> { - bits<2> ZAda; - let Inst{1-0} = ZAda; - let Inst{2} = 0b0; +multiclass sme_outer_product_fp32<bit S, string mnemonic, SDPatternOperator op> { + def NAME : sme_fp_outer_product_inst<S, 0b0, TileOp32, ZPR32, mnemonic> { + bits<2> ZAda; + let Inst{1-0} = ZAda; + let Inst{2} = 0b0; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR32>; + + def : Pat<(op imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm), + (nxv4f32 ZPR32:$zn), (nxv4f32 ZPR32:$zm)), + (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } -class sme_outer_product_fp64<bit S, string mnemonic> - : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> { - bits<3> ZAda; - let Inst{2-0} = ZAda; +multiclass sme_outer_product_fp64<bit S, string mnemonic, SDPatternOperator op> { + def NAME : sme_fp_outer_product_inst<S, 0b1, TileOp64, ZPR64, mnemonic> { + bits<3> ZAda; + let Inst{2-0} = ZAda; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR64>; + + def : Pat<(op imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm), + (nxv2f64 ZPR64:$zn), (nxv2f64 ZPR64:$zm)), + (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>; } class 
sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz, MatrixTileOperand za_ty, ZPRRegOp zpr_ty, string mnemonic> : I<(outs za_ty:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), + (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", "", []>, Sched<[]> { @@ -72,26 +110,44 @@ class sme_int_outer_product_inst<bit u0, bit u1, bit S, bit sz, let Inst{9-5} = Zn; let Inst{4} = S; let Inst{3} = 0b0; + + let Constraints = "$ZAda = $_ZAda"; } -class sme_int_outer_product_i32<bits<3> opc, string mnemonic> - : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32, ZPR8, - mnemonic> { - bits<2> ZAda; - let Inst{1-0} = ZAda; - let Inst{2} = 0b0; +multiclass sme_int_outer_product_i32<bits<3> opc, string mnemonic, + SDPatternOperator op> { + def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b0, TileOp32, + ZPR8, mnemonic> { + bits<2> ZAda; + let Inst{1-0} = ZAda; + let Inst{2} = 0b0; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR8>; + + def : Pat<(op imm0_3:$tile, (nxv16i1 PPR3bAny:$pn), (nxv16i1 PPR3bAny:$pm), + (nxv16i8 ZPR8:$zn), (nxv16i8 ZPR8:$zm)), + (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } -class sme_int_outer_product_i64<bits<3> opc, string mnemonic> - : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64, ZPR16, - mnemonic> { - bits<3> ZAda; - let Inst{2-0} = ZAda; +multiclass sme_int_outer_product_i64<bits<3> opc, string mnemonic, + SDPatternOperator op> { + def NAME : sme_int_outer_product_inst<opc{2}, opc{1}, opc{0}, 0b1, TileOp64, + ZPR16, mnemonic> { + bits<3> ZAda; + let Inst{2-0} = ZAda; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>; + + def : Pat<(op imm0_7:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), + (nxv8i16 ZPR16:$zn), (nxv8i16 ZPR16:$zm)), + (!cast<Instruction>(NAME # _PSEUDO) imm0_7:$tile, $pn, $pm, $zn, $zm)>; } class sme_outer_product_widening_inst<bit op, bit S, 
string mnemonic> : I<(outs TileOp32:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm), + (ins TileOp32:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", "", []>, Sched<[]> { @@ -109,14 +165,28 @@ class sme_outer_product_widening_inst<bit op, bit S, string mnemonic> let Inst{4} = S; let Inst{3-2} = 0b00; let Inst{1-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; } -multiclass sme_bf16_outer_product<bit S, string mnemonic> { - def : sme_outer_product_widening_inst<0b0, S, mnemonic>; +multiclass sme_bf16_outer_product<bit S, string mnemonic, SDPatternOperator op> { + def NAME : sme_outer_product_widening_inst<0b0, S, mnemonic>; + + def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>; + + def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), + (nxv8bf16 ZPR16:$zn), (nxv8bf16 ZPR16:$zm)), + (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } -multiclass sme_f16_outer_product<bit S, string mnemonic> { - def : sme_outer_product_widening_inst<0b1, S, mnemonic>; +multiclass sme_f16_outer_product<bit S, string mnemonic, SDPatternOperator op> { + def NAME : sme_outer_product_widening_inst<0b1, S, mnemonic>; + + def NAME # _PSEUDO : sme_outer_product_pseudo<ZPR16>; + + def : Pat<(op imm0_3:$tile, (nxv8i1 PPR3bAny:$pn), (nxv8i1 PPR3bAny:$pm), + (nxv8f16 ZPR16:$zn), (nxv8f16 ZPR16:$zm)), + (!cast<Instruction>(NAME # _PSEUDO) imm0_3:$tile, $pn, $pm, $zn, $zm)>; } //===----------------------------------------------------------------------===// @@ -126,7 +196,7 @@ multiclass sme_f16_outer_product<bit S, string mnemonic> { class sme_add_vector_to_tile_inst<bit op, bit V, MatrixTileOperand tile_ty, ZPRRegOp zpr_ty, string mnemonic> : I<(outs tile_ty:$ZAda), - (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), + (ins tile_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn", "", []>, Sched<[]> { bits<3> Pm; @@ -140,6 +210,8 @@ class 
sme_add_vector_to_tile_inst<bit op, bit V, MatrixTileOperand tile_ty, let Inst{12-10} = Pn; let Inst{9-5} = Zn; let Inst{4-3} = 0b00; + + let Constraints = "$ZAda = $_ZAda"; } class sme_add_vector_to_tile_u32<bit V, string mnemonic> @@ -225,6 +297,33 @@ multiclass sme_mem_ld_ss_aliases<string inst, bit is_col> { defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">; } +multiclass sme_mem_ld_ss_patterns<Instruction Inst, SDPatternOperator Load, + Operand tile_ty, Operand offset_ty, + ComplexPattern addr, + ComplexPattern tileslice> { + // base, tileslice + def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile, + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))), + (Inst tile_ty:$tile, $idx, $imm, $pg, $base, XZR)>; + + // reg + reg, tileslice + let AddedComplexity = 1 in { + def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset), + tile_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, + offset_ty:$imm))), + (Inst tile_ty:$tile, $idx, $imm, $pg, $base, $offset)>; + } +} + +class sme_load_pseudo + : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx, + i64imm:$imm, PPR3bAny:$pg, GPR64sp:$base, GPR64:$offset), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + let mayLoad = 1; +} + multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> { def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b", !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -264,6 +363,40 @@ multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> { } defm : sme_mem_ld_ss_aliases<NAME, is_col>; + + // Pseudo instructions for lowering intrinsics, using immediates instead of + // tile registers. 
+ def _PSEUDO_B : sme_load_pseudo; + def _PSEUDO_H : sme_load_pseudo; + def _PSEUDO_S : sme_load_pseudo; + def _PSEUDO_D : sme_load_pseudo; + def _PSEUDO_Q : sme_load_pseudo; + + defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_B), + !if(is_col, int_aarch64_sme_ld1b_vert, + int_aarch64_sme_ld1b_horiz), + sme_elm_idx0_0, imm0_15, am_sve_regreg_lsl0, + tileslice8>; + defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_H), + !if(is_col, int_aarch64_sme_ld1h_vert, + int_aarch64_sme_ld1h_horiz), + imm0_1, imm0_7, am_sve_regreg_lsl1, + tileslice16>; + defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_S), + !if(is_col, int_aarch64_sme_ld1w_vert, + int_aarch64_sme_ld1w_horiz), + imm0_3, imm0_3, am_sve_regreg_lsl2, + tileslice32>; + defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_D), + !if(is_col, int_aarch64_sme_ld1d_vert, + int_aarch64_sme_ld1d_horiz), + imm0_7, imm0_1, am_sve_regreg_lsl3, + tileslice64>; + defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + !if(is_col, int_aarch64_sme_ld1q_vert, + int_aarch64_sme_ld1q_horiz), + imm0_15, sme_elm_idx0_0, am_sve_regreg_lsl4, + tileslice128>; } multiclass sme_mem_ld_ss<string mnemonic> { @@ -310,6 +443,25 @@ multiclass sme_mem_st_ss_aliases<string inst, bit is_col> { defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>; } +multiclass sme_mem_st_ss_patterns<Instruction Inst, SDPatternOperator Store, + Operand offset_ty, + ComplexPattern imm2tile, + ComplexPattern addr, + ComplexPattern tileslice> { + // base, tileslice + def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile), + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))), + (Inst $tile, $idx, $imm, $pg, $base, XZR)>; + + // reg + reg, tileslice + let AddedComplexity = 1 in { + def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset), + (imm2tile untyped:$tile), + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))), + (Inst $tile, $idx, $imm, $pg, 
$base, $offset)>; + } +} + multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> { def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b", !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -349,6 +501,32 @@ multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> { } defm : sme_mem_st_ss_aliases<NAME, is_col>; + + defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _B), + !if(is_col, int_aarch64_sme_st1b_vert, + int_aarch64_sme_st1b_horiz), + imm0_15, imm_to_tile8, am_sve_regreg_lsl0, + tileslice8>; + defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _H), + !if(is_col, int_aarch64_sme_st1h_vert, + int_aarch64_sme_st1h_horiz), + imm0_7, imm_to_tile16, am_sve_regreg_lsl1, + tileslice16>; + defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _S), + !if(is_col, int_aarch64_sme_st1w_vert, + int_aarch64_sme_st1w_horiz), + imm0_3, imm_to_tile32, am_sve_regreg_lsl2, + tileslice32>; + defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _D), + !if(is_col, int_aarch64_sme_st1d_vert, + int_aarch64_sme_st1d_horiz), + imm0_1, imm_to_tile64, am_sve_regreg_lsl3, + tileslice64>; + defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _Q), + !if(is_col, int_aarch64_sme_st1q_vert, + int_aarch64_sme_st1q_horiz), + sme_elm_idx0_0, imm_to_tile128, + am_sve_regreg_lsl4, tileslice128>; } multiclass sme_mem_st_ss<string mnemonic> { @@ -360,7 +538,7 @@ multiclass sme_mem_st_ss<string mnemonic> { // SME Save and Restore Array //===----------------------------------------------------------------------===// -class sme_spill_fill_inst<bit isStore, dag outs, dag ins, string opcodestr> +class sme_spill_fill_base<bit isStore, dag outs, dag ins, string opcodestr> : I<outs, ins, opcodestr, "\t$ZAt[$Rv, $imm4], [$Rn, $offset, mul vl]", "", []>, Sched<[]> { @@ -375,33 +553,61 @@ class sme_spill_fill_inst<bit isStore, dag outs, dag ins, string opcodestr> let Inst{9-5} = Rn; let Inst{4} = 0b0; let Inst{3-0} = imm4; - - let mayLoad = !not(isStore); - let mayStore = isStore; } 
-multiclass sme_spill_fill<bit isStore, dag outs, dag ins, string opcodestr> { - def NAME : sme_spill_fill_inst<isStore, outs, ins, opcodestr>; - +let mayStore = 1 in +class sme_spill_inst<string opcodestr> + : sme_spill_fill_base<0b1, (outs), + (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, + sme_elm_idx0_15:$imm4, GPR64sp:$Rn, + imm0_15:$offset), + opcodestr>; +let mayLoad = 1 in +class sme_fill_inst<string opcodestr> + : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt), + (ins MatrixIndexGPR32Op12_15:$Rv, + sme_elm_idx0_15:$imm4, GPR64sp:$Rn, + imm0_15:$offset), + opcodestr>; +multiclass sme_spill<string opcodestr> { + def NAME : sme_spill_inst<opcodestr>; def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]", (!cast<Instruction>(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; -} - -multiclass sme_spill<string opcodestr> { - defm NAME : sme_spill_fill<0b1, (outs), - (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, - sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), - opcodestr>; + // base + def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), + (!cast<Instruction>(NAME) ZA, $idx, 0, $base, 0)>; + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, + (am_sme_indexed_b4 GPR64sp:$base, imm0_15:$imm4)), + (!cast<Instruction>(NAME) ZA, $idx, 0, $base, $imm4)>; + } } multiclass sme_fill<string opcodestr> { - defm NAME : sme_spill_fill<0b0, (outs MatrixOp:$ZAt), - (ins MatrixIndexGPR32Op12_15:$Rv, - sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), - opcodestr>; + def NAME : sme_fill_inst<opcodestr>; + def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]", + (!cast<Instruction>(NAME) MatrixOp:$ZAt, + MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; + def NAME # _PSEUDO + : Pseudo<(outs), + (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + GPR64sp:$base), []>, + Sched<[]> { + // Translated 
to actual instruction in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + let mayLoad = 1; + } + // base + def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), + (!cast<Instruction>(NAME # _PSEUDO) $idx, 0, $base)>; + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, + (am_sme_indexed_b4 GPR64sp:$base, imm0_15:$imm4)), + (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm4, $base)>; + } } //===----------------------------------------------------------------------===// @@ -429,8 +635,12 @@ class sme_vector_to_tile_inst<bit Q, bits<2> sz, MatrixTileVectorOperand tile_ty bit is_col, Operand imm_ty, ZPRRegOp zpr_ty, string mnemonic> : sme_vector_to_tile_base<Q, is_col, sz, (outs tile_ty:$ZAd), - (ins MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), - mnemonic, "\t$ZAd[$Rv, $imm], $Pg/m, $Zn">; + (ins tile_ty:$_ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), + mnemonic, "\t$ZAd[$Rv, $imm], $Pg/m, $Zn">{ + + let Constraints = "$ZAd = $_ZAd"; +} + multiclass sme_vector_to_tile_aliases<Instruction inst, MatrixTileVectorOperand tile_ty, @@ -439,6 +649,30 @@ multiclass sme_vector_to_tile_aliases<Instruction inst, (inst tile_ty:$ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), 1>; } +multiclass sme_vector_to_tile_patterns<Instruction inst, ValueType zpr_vt, + ValueType ppr_vt, Operand imm_ty, + Operand offset_ty, + SDPatternOperator op, + ComplexPattern tileslice> { + def : Pat<(op imm_ty:$tile, MatrixIndexGPR32Op12_15:$idx, + (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)), + (inst imm_ty:$tile, $idx, 0, $pg, $zn)>; + let AddedComplexity = 1 in { + def : Pat<(op imm_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, + offset_ty:$imm)), + (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)), + (inst imm_ty:$tile, $idx, $imm, $pg, $zn)>; + } +} + +class sme_mova_insert_pseudo + : Pseudo<(outs), (ins i64imm:$tile, 
MatrixIndexGPR32Op12_15:$idx, + i64imm:$imm, PPR3bAny:$pg, ZPRAny:$zn), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; +} + multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> { def _B : sme_vector_to_tile_inst<0b0, 0b00, !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -478,6 +712,14 @@ multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> { let Inst{3-0} = ZAd; } + // Pseudo instructions for lowering intrinsics, using immediates instead of + // tile registers. + def _PSEUDO_B : sme_mova_insert_pseudo; + def _PSEUDO_H : sme_mova_insert_pseudo; + def _PSEUDO_S : sme_mova_insert_pseudo; + def _PSEUDO_D : sme_mova_insert_pseudo; + def _PSEUDO_Q : sme_mova_insert_pseudo; + defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _B), !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -498,6 +740,62 @@ multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> { !if(is_col, TileVectorOpV128, TileVectorOpH128), ZPR128, sme_elm_idx0_0>; + + defvar op = !if(is_col, int_aarch64_sme_write_vert, + int_aarch64_sme_write_horiz); + + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_B), + nxv16i8, nxv16i1, sme_elm_idx0_0, imm0_15, + op, tileslice8>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H), + nxv8i16, nxv8i1, sme_elm_idx0_1, imm0_7, + op, tileslice16>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H), + nxv8f16, nxv8i1, sme_elm_idx0_1, imm0_7, + op, tileslice16>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H), + nxv8bf16, nxv8i1, sme_elm_idx0_1, imm0_7, + op, tileslice16>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_S), + nxv4i32, nxv4i1, sme_elm_idx0_3, imm0_3, + op, tileslice32>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_S), + nxv4f32, nxv4i1, sme_elm_idx0_3, imm0_3, + op, tileslice32>; + defm : 
sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_D), + nxv2i64, nxv2i1, sme_elm_idx0_7, imm0_1, + op, tileslice64>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_D), + nxv2f64, nxv2i1, sme_elm_idx0_7, imm0_1, + op, tileslice64>; + + defvar opq = !if(is_col, int_aarch64_sme_writeq_vert, + int_aarch64_sme_writeq_horiz); + + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv16i8, nxv16i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv8i16, nxv8i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv8f16, nxv8i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv8bf16, nxv8i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv4i32, nxv4i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv4f32, nxv4i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv2i64, nxv2i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; + defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q), + nxv2f64, nxv2i1, sme_elm_idx0_15, + sme_elm_idx0_0, opq, tileslice128>; } multiclass sme_vector_to_tile<string mnemonic> { @@ -526,8 +824,11 @@ class sme_tile_to_vector_inst<bit Q, bits<2> sz, ZPRRegOp zpr_ty, MatrixTileVectorOperand tile_ty, bit is_col, Operand imm_ty, string mnemonic> : sme_tile_to_vector_base<Q, is_col, sz, (outs zpr_ty:$Zd), - (ins PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), - mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv, $imm]">; + (ins zpr_ty:$_Zd, PPR3bAny:$Pg, 
tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), + mnemonic, "\t$Zd, $Pg/m, $ZAn[$Rv, $imm]"> { + + let Constraints = "$Zd = $_Zd"; +} multiclass sme_tile_to_vector_aliases<Instruction inst, ZPRRegOp zpr_ty, MatrixTileVectorOperand tile_ty, @@ -536,6 +837,23 @@ multiclass sme_tile_to_vector_aliases<Instruction inst, ZPRRegOp zpr_ty, (inst zpr_ty:$Zd, PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), 1>; } +multiclass sme_tile_to_vector_patterns<Instruction inst, ValueType zpr_vt, + ValueType ppr_vt, Operand offset_ty, + ComplexPattern imm2tile, + ComplexPattern tileslice, + SDPatternOperator op> { + def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg), + (imm2tile untyped:$tile), MatrixIndexGPR32Op12_15:$idx)), + (inst $passthru, $pg, $tile, $idx, 0)>; + let AddedComplexity = 1 in { + def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg), + (imm2tile untyped:$tile), + (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, + offset_ty:$imm)))), + (inst $passthru, $pg, $tile, $idx, $imm)>; + } +} + multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> { def _B : sme_tile_to_vector_inst<0b0, 0b00, ZPR8, !if(is_col, TileVectorOpV8, TileVectorOpH8), @@ -589,6 +907,62 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> { defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _Q), ZPR128, !if(is_col, TileVectorOpV128, TileVectorOpH128), sme_elm_idx0_0>; + + defvar op = !if(is_col, int_aarch64_sme_read_vert, + int_aarch64_sme_read_horiz); + + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _B), + nxv16i8, nxv16i1, imm0_15, + imm_to_tile8, tileslice8, op>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _H), + nxv8i16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _H), + nxv8f16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _H), + 
nxv8bf16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _S), + nxv4i32, nxv4i1, imm0_3, + imm_to_tile32, tileslice32, op>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _S), + nxv4f32, nxv4i1, imm0_3, + imm_to_tile32, tileslice32, op>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _D), + nxv2i64, nxv2i1, imm0_1, + imm_to_tile64, tileslice64, op>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _D), + nxv2f64, nxv2i1, imm0_1, + imm_to_tile64, tileslice64, op>; + + defvar opq = !if(is_col, int_aarch64_sme_readq_vert, + int_aarch64_sme_readq_horiz); + + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv16i8, nxv16i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv8i16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv8f16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv8bf16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv4i32, nxv4i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv4f32, nxv4i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv2i64, nxv2i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns<!cast<Instruction>(NAME # _Q), + nxv2f64, nxv2i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; } multiclass sme_tile_to_vector<string mnemonic> { @@ -600,8 +974,11 @@ multiclass sme_tile_to_vector<string mnemonic> { // SME Zero 
//===----------------------------------------------------------------------===// +// NOTE: This definition isn't really correct because there are outputs, i.e. +// the tile registers being zeroed. We fix this up in a custom inserter that +// marks the appropriate registers as being implicitly defined. class sme_zero_inst<string mnemonic> - : I<(outs MatrixTileList:$imm), (ins), + : I<(outs), (ins MatrixTileList:$imm), mnemonic, "\t$imm", "", []>, Sched<[]> { bits<8> imm; let Inst{31-8} = 0b110000000000100000000000; @@ -626,6 +1003,15 @@ multiclass sme_zero<string mnemonic> { def : InstAlias<"zero\t\\{za0.s,za1.s,za3.s\\}", (!cast<Instruction>(NAME) 0b10111011), 1>; def : InstAlias<"zero\t\\{za0.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11011101), 1>; def : InstAlias<"zero\t\\{za1.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11101110), 1>; + + def NAME # _PSEUDO : Pseudo<(outs), (ins i64imm:$tilelist), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; + } + + def : Pat<(int_aarch64_sme_zero imm:$imm), + (!cast<Instruction>(NAME # _PSEUDO) imm:$imm)>; } //===----------------------------------------------------------------------===// @@ -651,6 +1037,15 @@ class sve2_int_perm_revd<string asm> let ElementSize = ZPR128.ElementSize; } +multiclass sve2_int_perm_revd<string asm, SDPatternOperator op> { + def NAME : sve2_int_perm_revd<asm>; + + def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME)>; + def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME)>; + def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME)>; +} + class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty> : I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd), asm, "\t$Zd, $Zn, $Zm", "", []>, @@ -672,11 +1067,16 @@ class sve2_clamp<string 
asm, bits<2> sz, bit U, ZPRRegOp zpr_ty> let ElementSize = zpr_ty.ElementSize; } -multiclass sve2_clamp<string asm, bit U> { +multiclass sve2_clamp<string asm, bit U, SDPatternOperator op> { def _B : sve2_clamp<asm, 0b00, U, ZPR8>; def _H : sve2_clamp<asm, 0b01, U, ZPR16>; def _S : sve2_clamp<asm, 0b10, U, ZPR32>; def _D : sve2_clamp<asm, 0b11, U, ZPR64>; + + def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty> @@ -699,7 +1099,7 @@ class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty> let Inst{3-0} = Pd; } -multiclass sve2_int_perm_sel_p<string asm> { +multiclass sve2_int_perm_sel_p<string asm, SDPatternOperator op> { def _B : sve2_int_perm_sel_p<asm, PPR8, sme_elm_idx0_15> { bits<4> imm; let Inst{23-22} = imm{3-2}; @@ -723,4 +1123,32 @@ multiclass sve2_int_perm_sel_p<string asm> { let Inst{22} = 0b1; let Inst{20-18} = 0b000; } + + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, 0)>; + def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, 0)>; + def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, 0)>; + def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + MatrixIndexGPR32Op12_15:$idx)), + (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, 0)>; + + let AddedComplexity = 1 in { + def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm), + (i32 (tileslice8 
MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm)))), + (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, $imm)>; + def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm), + (i32 (tileslice16 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_7:$imm)))), + (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, $imm)>; + def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm), + (i32 (tileslice32 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_3:$imm)))), + (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, $imm)>; + def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm), + (i32 (tileslice64 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_1:$imm)))), + (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, $imm)>; + } } diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 9d4bdbe5d053..3631536a32b9 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -199,6 +199,11 @@ def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", [ def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>; def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64>", []>; +def SVECpyDupImm8Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i8>", []>; +def SVECpyDupImm16Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i16>", []>; +def SVECpyDupImm32Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i32>", []>; +def SVECpyDupImm64Pat : ComplexPattern<i64, 2, "SelectSVECpyDupImm<MVT::i64>", []>; + def SVELogicalImm8Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8>", []>; def SVELogicalImm16Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16>", []>; def SVELogicalImm32Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32>", []>; @@ -209,14 +214,6 @@ def SVELogicalImm16NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16 def SVELogicalImm32NotPat : ComplexPattern<i32, 1, 
"SelectSVELogicalImm<MVT::i32, true>", []>; def SVELogicalImm64NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64, true>", []>; -def SVE8BitLslImm32 : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>; -def SVE8BitLslImm64 : ComplexPattern<i64, 2, "SelectSVE8BitLslImm", [imm]>; -class SVE8BitLslImm<ValueType ty> { - ComplexPattern Pat = !cond( - !eq(ty, i32): SVE8BitLslImm32, - !eq(ty, i64): SVE8BitLslImm64); -} - def SVEArithUImm8Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>; def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>; def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>; @@ -234,6 +231,8 @@ def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", [] def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>; def SVEShiftImmR64 : ComplexPattern<i64, 1, "SelectSVEShiftImm<1, 64, true>", []>; +def SVEShiftSplatImmR : ComplexPattern<iAny, 1, "SelectSVEShiftSplatImmR", []>; + def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>; class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass { @@ -335,9 +334,14 @@ multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> { def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>; -let Predicates = [HasSVEorStreamingSVE] in { +let Predicates = [HasSVEorSME] in { defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>; defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>; + + def : Pat<(nxv16i1 immAllOnesV), (PTRUE_B 31)>; + def : Pat<(nxv8i1 immAllOnesV), (PTRUE_H 31)>; + def : Pat<(nxv4i1 immAllOnesV), (PTRUE_S 31)>; + def : Pat<(nxv2i1 immAllOnesV), (PTRUE_D 31)>; } //===----------------------------------------------------------------------===// @@ -370,24 +374,27 @@ class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator 
op, ValueType : Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)), (inst $Op3, $Op1, $Op2)>; -class SVE_1_Op_Imm_OptLsl_Reverse_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty, - ValueType it, ComplexPattern cpx, Instruction inst> - : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))), - (inst $Op1, i32:$imm, i32:$shift)>; +multiclass SVE_1_Op_PassthruUndef_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType pg, + ValueType vts, Instruction inst>{ + def : Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), (vtd undef))), + (inst (IMPLICIT_DEF), $Op1, $Op2)>; + def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, (i64 timm0_1), vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; +} class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst> - : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))), + : Pat<(vt (op (vt zprty:$Op1), (vt (splat_vector (it (cpx i32:$imm, i32:$shift)))))), (inst $Op1, i32:$imm, i32:$shift)>; class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt, SDPatternOperator op, ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst> - : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))), + : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (splat_vector (it (cpx i32:$imm)))))), (inst $Op1, i32:$imm)>; class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst> - : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))), + : Pat<(vt (op (vt zprty:$Op1), (vt (splat_vector (it (cpx i64:$imm)))))), (inst $Op1, i64:$imm)>; class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, @@ -489,20 +496,20 @@ multiclass SVE_InReg_Extend_PassthruUndef<ValueType vt, SDPatternOperator op, Va class SVE_Shift_DupImm_Pred_Pat<ValueType vt, SDPatternOperator op, ValueType pt, 
ValueType it, ComplexPattern cast, Instruction inst> -: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), +: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (splat_vector (it (cast i32:$imm)))))), (inst $Pg, $Rn, i32:$imm)>; class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op, ValueType pt, ValueType it, ComplexPattern cast, Instruction inst> -: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), +: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (splat_vector (it (cast i32:$imm)))))), (inst $Rn, i32:$imm)>; class SVE_2_Op_Fp_Imm_Pat<ValueType vt, SDPatternOperator op, ValueType pt, ValueType it, FPImmLeaf immL, int imm, Instruction inst> -: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (AArch64dup (it immL))))), +: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (splat_vector (it immL))))), (inst $Pg, $Zs1, imm)>; class SVE_2_Op_Fp_Imm_Pat_Zero<ValueType vt, SDPatternOperator op, @@ -510,9 +517,33 @@ class SVE_2_Op_Fp_Imm_Pat_Zero<ValueType vt, SDPatternOperator op, FPImmLeaf immL, int imm, Instruction inst> : Pat<(vt (op pt:$Pg, (vselect pt:$Pg, vt:$Zs1, (SVEDup0)), - (vt (AArch64dup (it immL))))), + (vt (splat_vector (it immL))))), (inst $Pg, $Zs1, imm)>; +// Used to re-order the operands of BSP when lowering to BSL. BSP has the order: +// mask, in1, in2 whereas BSL for SVE2 has them ordered in1, in2, mask +class SVE_3_Op_BSP_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, + ValueType vt2, ValueType vt3, Instruction inst> +: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)), + (inst $Op2, $Op3, $Op1)>; + +class SVE_Shift_Add_All_Active_Pat<ValueType vtd, SDPatternOperator op, ValueType pt, + ValueType vt1, ValueType vt2, ValueType vt3, + Instruction inst> +: Pat<(vtd (add vt1:$Op1, (op (pt (SVEAllActive)), vt2:$Op2, vt3:$Op3))), + (inst $Op1, $Op2, $Op3)>; + +//===----------------------------------------------------------------------===// +// SVE pattern match helpers. 
+//===----------------------------------------------------------------------===// + +// Matches either an intrinsic, or a predicated operation with an all active predicate +class EitherVSelectOrPassthruPatFrags<SDPatternOperator intrinsic, SDPatternOperator sdnode> +: PatFrags<(ops node:$Pg, node:$Op1, node:$Op2), [ + (intrinsic node:$Pg, node:$Op1, node:$Op2), + (vselect node:$Pg, (sdnode (SVEAllActive), node:$Op1, node:$Op2), node:$Op1), + ]>; + // // Pseudo -> Instruction mappings // @@ -612,10 +643,11 @@ class sve_int_pfalse<bits<6> opc, string asm> multiclass sve_int_pfalse<bits<6> opc, string asm> { def NAME : sve_int_pfalse<opc, asm>; - def : Pat<(nxv16i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; - def : Pat<(nxv8i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; - def : Pat<(nxv4i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; - def : Pat<(nxv2i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; + def : Pat<(nxv16i1 immAllZerosV), (!cast<Instruction>(NAME))>; + def : Pat<(nxv8i1 immAllZerosV), (!cast<Instruction>(NAME))>; + def : Pat<(nxv4i1 immAllZerosV), (!cast<Instruction>(NAME))>; + def : Pat<(nxv2i1 immAllZerosV), (!cast<Instruction>(NAME))>; + def : Pat<(nxv1i1 immAllZerosV), (!cast<Instruction>(NAME))>; } class sve_int_ptest<bits<6> opc, string asm> @@ -885,6 +917,8 @@ class sve_int_count<bits<3> opc, string asm> let Inst{10} = opc{0}; let Inst{9-5} = pattern; let Inst{4-0} = Rd; + + let isReMaterializable = 1; } multiclass sve_int_count<bits<3> opc, string asm, SDPatternOperator op> { @@ -965,7 +999,7 @@ class sve_int_pred_pattern_a<bits<3> opc, string asm> multiclass sve_int_pred_pattern_a<bits<3> opc, string asm, SDPatternOperator op, SDPatternOperator opcnt> { - let Predicates = [HasSVEorStreamingSVE] in { + let Predicates = [HasSVEorSME] in { def NAME : sve_int_pred_pattern_a<opc, asm>; def : InstAlias<asm # "\t$Rdn, $pattern", @@ -974,7 +1008,7 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm, 
(!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>; } - let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL] in { + let Predicates = [HasSVEorSME, UseScalarIncVL] in { def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))), (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>; @@ -1170,28 +1204,45 @@ multiclass sve_int_perm_dup_i<string asm> { (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>; // Duplicate extracted element of vector into all vector elements - def : Pat<(nxv16i8 (AArch64dup (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))), + def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))), (!cast<Instruction>(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>; - def : Pat<(nxv8i16 (AArch64dup (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + def : Pat<(nxv8i16 (splat_vector (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv4i32 (AArch64dup (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + def : Pat<(nxv4i32 (splat_vector (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2i64 (AArch64dup (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2i64 (splat_vector (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv8f16 (AArch64dup (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + def : Pat<(nxv8f16 (splat_vector (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 
(vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + def : Pat<(nxv8bf16 (splat_vector (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv4f16 (AArch64dup (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + def : Pat<(nxv4f16 (splat_vector (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2f16 (AArch64dup (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2f16 (splat_vector (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv4f32 (AArch64dup (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + def : Pat<(nxv4f32 (splat_vector (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2f32 (AArch64dup (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2f32 (splat_vector (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv2f64 (AArch64dup (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + def : Pat<(nxv2f64 (splat_vector (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + + def : Pat<(nxv16i8 (AArch64duplane128 nxv16i8:$Op1, i64:$imm)), + (!cast<Instruction>(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv8i16 (AArch64duplane128 nxv8i16:$Op1, i64:$imm)), + (!cast<Instruction>(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv4i32 (AArch64duplane128 nxv4i32:$Op1, i64:$imm)), + 
(!cast<Instruction>(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv2i64 (AArch64duplane128 nxv2i64:$Op1, i64:$imm)), + (!cast<Instruction>(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv8f16 (AArch64duplane128 nxv8f16:$Op1, i64:$imm)), + (!cast<Instruction>(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv4f32 (AArch64duplane128 nxv4f32:$Op1, i64:$imm)), + (!cast<Instruction>(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv2f64 (AArch64duplane128 nxv2f64:$Op1, i64:$imm)), + (!cast<Instruction>(NAME # _Q) $Op1, $imm)>; + def : Pat<(nxv8bf16 (AArch64duplane128 nxv8bf16:$Op1, i64:$imm)), + (!cast<Instruction>(NAME # _Q) $Op1, $imm)>; } class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm, ZPRRegOp zprty, @@ -1631,6 +1682,7 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op, def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, nxv8i1, !cast<Instruction>(NAME)>; def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, nxv4i1, !cast<Instruction>(NAME)>; def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, nxv2i1, !cast<Instruction>(NAME)>; + def : SVE_3_Op_Pat<nxv1i1, op, nxv1i1, nxv1i1, nxv1i1, !cast<Instruction>(NAME)>; def : SVE_2_Op_AllActive_Pat<nxv16i1, op_nopred, nxv16i1, nxv16i1, !cast<Instruction>(NAME), PTRUE_B>; def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8i1, nxv8i1, @@ -1743,7 +1795,7 @@ multiclass sve_int_dup_mask_imm<string asm> { def : InstAlias<"mov $Zd, $imm", (!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>; - def : Pat<(nxv2i64 (AArch64dup (i64 logical_imm64:$imm))), + def : Pat<(nxv2i64 (splat_vector (i64 logical_imm64:$imm))), (!cast<Instruction>(NAME) logical_imm64:$imm)>; } @@ -2478,7 +2530,7 @@ multiclass sve2_fp_mla_long<bits<2> opc, string asm, SDPatternOperator op> { // SVE Stack Allocation Group //===----------------------------------------------------------------------===// -class sve_int_arith_vl<bit opc, string asm> +class sve_int_arith_vl<bit opc, string asm, bit streaming_sve = 0b0> : I<(outs GPR64sp:$Rd), (ins 
GPR64sp:$Rn, simm6_32b:$imm6), asm, "\t$Rd, $Rn, $imm6", "", @@ -2490,12 +2542,13 @@ class sve_int_arith_vl<bit opc, string asm> let Inst{22} = opc; let Inst{21} = 0b1; let Inst{20-16} = Rn; - let Inst{15-11} = 0b01010; + let Inst{15-12} = 0b0101; + let Inst{11} = streaming_sve; let Inst{10-5} = imm6; let Inst{4-0} = Rd; } -class sve_int_read_vl_a<bit op, bits<5> opc2, string asm> +class sve_int_read_vl_a<bit op, bits<5> opc2, string asm, bit streaming_sve = 0b0> : I<(outs GPR64:$Rd), (ins simm6_32b:$imm6), asm, "\t$Rd, $imm6", "", @@ -2506,9 +2559,12 @@ class sve_int_read_vl_a<bit op, bits<5> opc2, string asm> let Inst{22} = op; let Inst{21} = 0b1; let Inst{20-16} = opc2{4-0}; - let Inst{15-11} = 0b01010; + let Inst{15-12} = 0b0101; + let Inst{11} = streaming_sve; let Inst{10-5} = imm6; let Inst{4-0} = Rd; + + let isReMaterializable = 1; } //===----------------------------------------------------------------------===// @@ -2589,8 +2645,8 @@ multiclass sve_fp_2op_p_zd<bits<7> opc, string asm, SDPatternOperator int_op, SDPatternOperator ir_op, ValueType vt1, ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { - def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>; - + def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>, + SVEPseudo2Instr<NAME, 1>; // convert vt1 to a packed type for the intrinsic patterns defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16, !eq(!cast<string>(vt1), "nxv4f16"): nxv8f16, @@ -2604,8 +2660,11 @@ multiclass sve_fp_2op_p_zd<bits<7> opc, string asm, 1 : vt3); def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, packedvt3, !cast<Instruction>(NAME)>; - def : SVE_1_Op_Passthru_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>; + + def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype)>; + + defm : SVE_1_Op_PassthruUndef_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME # _UNDEF)>; } multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm, @@ -2614,7 +2673,8 @@ multiclass 
sve_fp_2op_p_zdr<bits<7> opc, string asm, SDPatternOperator int_op, SDPatternOperator ir_op, ValueType vt1, ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { - def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>; + def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>, + SVEPseudo2Instr<NAME, 1>; // convert vt1 to a packed type for the intrinsic patterns defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16, @@ -2623,8 +2683,11 @@ multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm, 1 : vt1); def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, vt3, !cast<Instruction>(NAME)>; - def : SVE_1_Op_Passthru_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>; + + def _UNDEF : PredOneOpPassthruPseudo<NAME, !cast<ZPRRegOp>(i_zprtype)>; + + defm : SVE_1_Op_PassthruUndef_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME # _UNDEF)>; } multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> { @@ -2726,11 +2789,19 @@ class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc, let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_log<bits<3> opc, string asm, SDPatternOperator op> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_log<bits<3> opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>, + SVEPseudo2Instr<Ps # _B, 1>; + def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>, + SVEPseudo2Instr<Ps # _H, 1>; + def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>, + SVEPseudo2Instr<Ps # _S, 1>; + def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>, + SVEPseudo2Instr<Ps 
# _D, 1>; + } def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; @@ -3756,7 +3827,8 @@ class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm, } multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm, - SDPatternOperator op> { + SDPatternOperator op, + SDPatternOperator shift_op = null_frag> { def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -3773,6 +3845,11 @@ multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm, def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>; + + def : SVE_Shift_Add_All_Active_Pat<nxv16i8, shift_op, nxv16i1, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME # _B)>; + def : SVE_Shift_Add_All_Active_Pat<nxv8i16, shift_op, nxv8i1, nxv8i16, nxv8i16, i32, !cast<Instruction>(NAME # _H)>; + def : SVE_Shift_Add_All_Active_Pat<nxv4i32, shift_op, nxv4i1, nxv4i32, nxv4i32, i32, !cast<Instruction>(NAME # _S)>; + def : SVE_Shift_Add_All_Active_Pat<nxv2i64, shift_op, nxv2i1, nxv2i64, nxv2i64, i32, !cast<Instruction>(NAME # _D)>; } class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty> @@ -4331,18 +4408,6 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> { def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> { - def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; - def _H : 
sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; - def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>; - def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>; - - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv16i8, op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>; -} - class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm, ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm), @@ -4458,7 +4523,8 @@ class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm> let ElementSize = ElementSizeNone; } -multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> { +multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op, + SDPatternOperator ir_op = null_frag> { def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>; def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk", @@ -4472,6 +4538,12 @@ multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperato def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>; def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>; def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>; + + + def : SVE_3_Op_BSP_Pat<nxv16i8, ir_op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>; + def : SVE_3_Op_BSP_Pat<nxv8i16, ir_op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>; + def : SVE_3_Op_BSP_Pat<nxv4i32, ir_op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>; + def : SVE_3_Op_BSP_Pat<nxv2i64, ir_op, nxv2i64, nxv2i64, 
nxv2i64, !cast<Instruction>(NAME)>; } class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm, @@ -4578,29 +4650,28 @@ class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm, } multiclass sve_int_dup_imm_pred_merge_inst< - bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, - ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm, + ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> { let Constraints = "$Zd = $_Zd" in def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m", (ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>; def : InstAlias<"mov $Zd, $Pg/m, $imm", (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; - def : Pat<(intty - (vselect predty:$Pg, - (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))), - intty:$Zd)), - (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>; + def : Pat<(vselect predty:$Pg, + (intty (splat_vector (scalarty (cpx i32:$imm, i32:$shift)))), + ZPR:$Zd), + (!cast<Instruction>(NAME) $Zd, $Pg, $imm, $shift)>; } multiclass sve_int_dup_imm_pred_merge<string asm> { - defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, - i32, cpy_imm8_opt_lsl_i8>; - defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, - i32, cpy_imm8_opt_lsl_i16>; - defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, - i32, cpy_imm8_opt_lsl_i32>; - defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, - i64, cpy_imm8_opt_lsl_i64>; + defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8, + nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>; + defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16, + nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>; + defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32, + nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>; + 
defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64, + nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>; @@ -4608,11 +4679,24 @@ multiclass sve_int_dup_imm_pred_merge<string asm> { (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>; + + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv8f16 ZPR:$Zd)), + (!cast<Instruction>(NAME # _H) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv4f16 ZPR:$Zd)), + (!cast<Instruction>(NAME # _S) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f16 ZPR:$Zd)), + (!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv4f32 ZPR:$Zd)), + (!cast<Instruction>(NAME # _S) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f32 ZPR:$Zd)), + (!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>; + def : Pat<(vselect PPRAny:$Pg, (SVEDup0), (nxv2f64 ZPR:$Zd)), + (!cast<Instruction>(NAME # _D) $Zd, $Pg, 0, 0)>; } multiclass sve_int_dup_imm_pred_zero_inst< - bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, - ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm, + ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> { def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z", (ins PPRAny:$Pg, cpyimm:$imm)>; def : InstAlias<"mov $Zd, $Pg/z, $imm", @@ -4623,22 +4707,21 @@ multiclass sve_int_dup_imm_pred_zero_inst< (!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>; def : Pat<(intty (anyext (predty PPRAny:$Ps1))), (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>; - def : Pat<(intty - (vselect predty:$Pg, - (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))), - (intty (AArch64dup (scalarty 0))))), 
- (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>; + def : Pat<(vselect predty:$Pg, + (intty (splat_vector (scalarty (cpx i32:$imm, i32:$shift)))), + (intty (splat_vector (scalarty 0)))), + (!cast<Instruction>(NAME) $Pg, $imm, $shift)>; } multiclass sve_int_dup_imm_pred_zero<string asm> { - defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, - i32, cpy_imm8_opt_lsl_i8>; - defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, - i32, cpy_imm8_opt_lsl_i16>; - defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, - i32, cpy_imm8_opt_lsl_i32>; - defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, - i64, cpy_imm8_opt_lsl_i64>; + defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8, + nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>; + defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16, + nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>; + defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32, + nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>; + defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64, + nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>; } //===----------------------------------------------------------------------===// @@ -4690,6 +4773,10 @@ multiclass SVE_SETCC_Pat_With_Zero<CondCode cc, CondCode invcc, ValueType predvt (cmp $Op1, $Op2)>; def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)), (cmp $Op1, $Op2)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op1, (SVEDup0), cc))), + (cmp $Pg, $Op1)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), (SVEDup0), intvt:$Op1, invcc))), + (cmp $Pg, $Op1)>; } multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> { @@ -4761,14 +4848,26 @@ multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc, ValueType predvt, 
ValueType intvt, Operand immtype, Instruction cmp> { def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), - (intvt ZPR:$Zs1), - (intvt (AArch64dup (immtype:$imm))), - cc)), + (intvt ZPR:$Zs1), + (intvt (splat_vector (immtype:$imm))), + cc)), (cmp $Pg, $Zs1, immtype:$imm)>; def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), - (intvt (AArch64dup (immtype:$imm))), - (intvt ZPR:$Zs1), - commuted_cc)), + (intvt (splat_vector (immtype:$imm))), + (intvt ZPR:$Zs1), + commuted_cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; + def : Pat<(predvt (and predvt:$Pg, + (AArch64setcc_z (predvt (AArch64ptrue 31)), + (intvt ZPR:$Zs1), + (intvt (splat_vector (immtype:$imm))), + cc))), + (cmp $Pg, $Zs1, immtype:$imm)>; + def : Pat<(predvt (and predvt:$Pg, + (AArch64setcc_z (predvt (AArch64ptrue 31)), + (intvt (splat_vector (immtype:$imm))), + (intvt ZPR:$Zs1), + commuted_cc))), (cmp $Pg, $Zs1, immtype:$imm)>; } @@ -5148,6 +5247,8 @@ class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{15-10} = 0b010000; let Inst{9-5} = imm5; let Inst{4-0} = Zd; + + let isReMaterializable = 1; } multiclass sve_int_index_ii<string asm> { @@ -5166,13 +5267,13 @@ multiclass sve_int_index_ii<string asm> { (!cast<Instruction>(NAME # "_D") (i64 0), simm5_64b:$imm5b)>; // add(step_vector(step), dup(X)) -> index(X, step). 
- def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (splat_vector(simm5_8b:$imm5)))), (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (splat_vector(simm5_16b:$imm5)))), (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<SDNodeXForm>("trunc_imm") $imm5b))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (splat_vector(simm5_32b:$imm5)))), (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>; - def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>; } @@ -5211,35 +5312,35 @@ multiclass sve_int_index_ir<string asm, SDPatternOperator mulop, SDPatternOperat (!cast<Instruction>(NAME # "_D") (i64 0), (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>; // add(step_vector(step), dup(X)) -> index(X, step). 
- def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (splat_vector(simm5_8b:$imm5)))), (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (splat_vector(simm5_16b:$imm5)))), (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (splat_vector(simm5_32b:$imm5)))), (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, (!cast<Instruction>("MOVi32imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (!cast<Instruction>("MOVi64imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>; // mul(step_vector(1), dup(Y)) -> index(0, Y). 
- def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (!cast<Instruction>(NAME # "_B") (i32 0), GPR32:$Rm)>; - def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (!cast<Instruction>(NAME # "_H") (i32 0), GPR32:$Rm)>; - def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (!cast<Instruction>(NAME # "_S") (i32 0), GPR32:$Rm)>; - def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), + def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (!cast<Instruction>(NAME # "_D") (i64 0), GPR64:$Rm)>; // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y). 
- def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(simm5_8b:$imm5)))), (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>; - def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (nxv8i16 (splat_vector(simm5_16b:$imm5)))), (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>; - def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (nxv4i32 (splat_vector(simm5_32b:$imm5)))), (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>; - def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (nxv2i64 (splat_vector(simm5_64b:$imm5)))), (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>; } @@ -5267,13 +5368,13 @@ multiclass sve_int_index_ri<string asm> { def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>; // add(step_vector(step), dup(X)) -> index(X, step). 
- def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (!cast<Instruction>(NAME # "_B") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (!cast<Instruction>(NAME # "_H") GPR32:$Rm, (!cast<SDNodeXForm>("trunc_imm") $imm5))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>; - def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>; } @@ -5301,25 +5402,25 @@ multiclass sve_int_index_rr<string asm, SDPatternOperator mulop> { def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>; // add(step_vector(step), dup(X)) -> index(X, step). 
- def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))), (!cast<Instruction>(NAME # "_B") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>; - def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (splat_vector(i32 GPR32:$Rn)))), (!cast<Instruction>(NAME # "_H") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)))>; - def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (splat_vector(i32 GPR32:$Rn)))), (!cast<Instruction>(NAME # "_S") GPR32:$Rn, (!cast<Instruction>("MOVi32imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (splat_vector(i64 GPR64:$Rn)))), (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (!cast<Instruction>("MOVi64imm") $imm))>; - def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (splat_vector(i64 GPR64:$Rn)))), (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>; // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y). 
- def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))), (!cast<Instruction>(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>; - def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),(nxv8i16 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),(nxv8i16 (splat_vector(i32 GPR32:$Rn)))), (!cast<Instruction>(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>; - def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),(nxv4i32 (AArch64dup(i32 GPR32:$Rn)))), + def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),(nxv4i32 (splat_vector(i32 GPR32:$Rn)))), (!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>; - def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),(nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),(nxv2i64 (splat_vector(i64 GPR64:$Rn)))), (!cast<Instruction>(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>; } @@ -5972,25 +6073,25 @@ multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm, SDPatternOperator op, RegisterOperand zprext, ValueType vt> { - def _SCALED_REAL : sve_mem_sst_sv2<msz, 1, asm, zprext>; + def _SCALED : sve_mem_sst_sv2<msz, 1, asm, zprext>; def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]", - (!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, 
GPR64sp:$Rn, zprext:$Zm), 0>; + (!cast<Instruction>(NAME # _SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt), - (!cast<Instruction>(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast<Instruction>(NAME # _SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm, SDPatternOperator op, ValueType vt> { - def _REAL : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>; + def NAME : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>; def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]", - (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), - (!cast<Instruction>(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast<Instruction>(NAME) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty, @@ -8433,6 +8534,7 @@ def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", [ def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>; def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>; def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>; +def am_sve_regreg_lsl4 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<4>", []>; // Predicated pseudo floating point two operand instructions. 
multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> { diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 4a24162540a5..ccb34f367338 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -305,8 +305,7 @@ bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) { // ..where the value stored comes from a vector extract.. auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0)); - if (!IntrI || - IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract) + if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_extract) return false; // ..that is extracting from index 0.. @@ -365,8 +364,7 @@ bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) { // ..whose operand is a vector_insert.. auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0)); - if (!IntrI || - IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert) + if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::vector_insert) return false; // ..that is inserting into index zero of an undef vector.. 
@@ -451,8 +449,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { continue; switch (F.getIntrinsicID()) { - case Intrinsic::experimental_vector_extract: - case Intrinsic::experimental_vector_insert: + case Intrinsic::vector_extract: + case Intrinsic::vector_insert: case Intrinsic::aarch64_sve_ptrue: for (User *U : F.users()) Functions.insert(cast<Instruction>(U)->getFunction()); diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 5906a5d6b50b..71303611265c 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -634,7 +634,8 @@ namespace AArch64SysReg { FeatureBitset FeaturesRequired; bool haveFeatures(FeatureBitset ActiveFeatures) const { - return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + return ActiveFeatures[llvm::AArch64::FeatureAll] || + (FeaturesRequired & ActiveFeatures) == FeaturesRequired; } }; |
