Diffstat (limited to 'test/CodeGen/AArch64')
148 files changed, 28157 insertions, 811 deletions
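Nearly every hunk in this diff rewrites a bare "; CHECK: <name>:" line as "; CHECK-LABEL: <name>:". The short stand-alone test below is not taken from any file in the diff; the functions @first and @second are made-up names used only for illustration. It sketches the FileCheck behaviour the change relies on: CHECK-LABEL directives split the checked output into blocks at each matched label, so patterns written for one function can only match text between that function's label and the next one, and mismatches are reported against the right function.

; Hypothetical illustration, not part of this diff.
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s

define i32 @first(i32 %a, i32 %b) {
; CHECK-LABEL: first:
; CHECK: add w0, w0, w1
  %sum = add i32 %a, %b
  ret i32 %sum
}

define i32 @second(i32 %a, i32 %b) {
; CHECK-LABEL: second:
; CHECK: sub w0, w0, w1
; With plain CHECK lines, a pattern here could drift and match a similar
; instruction emitted for another function; CHECK-LABEL confines it to the
; block of output belonging to @second.
  %res = sub i32 %a, %b
  ret i32 %res
}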
diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll
index 7cb373232a2cc..26fd3e66b798c 100644
--- a/test/CodeGen/AArch64/adc.ll
+++ b/test/CodeGen/AArch64/adc.ll
@@ -1,7 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
-; CHECK: test_simple:
+; CHECK-LABEL: test_simple:
%valadd = add i128 %a, %b
; CHECK: adds [[ADDLO:x[0-9]+]], x0, x2
@@ -16,7 +16,7 @@ define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
}
define i128 @test_imm(i128 %a) {
-; CHECK: test_imm:
+; CHECK-LABEL: test_imm:
%val = add i128 %a, 12
; CHECK: adds x0, x0, #12
@@ -27,7 +27,7 @@ define i128 @test_imm(i128 %a) {
}
define i128 @test_shifted(i128 %a, i128 %b) {
-; CHECK: test_shifted:
+; CHECK-LABEL: test_shifted:
%rhs = shl i128 %b, 45
@@ -40,7 +40,7 @@ define i128 @test_shifted(i128 %a, i128 %b) {
}
define i128 @test_extended(i128 %a, i16 %b) {
-; CHECK: test_extended:
+; CHECK-LABEL: test_extended:
%ext = sext i16 %b to i128
%rhs = shl i128 %ext, 3
diff --git a/test/CodeGen/AArch64/addsub-shifted.ll b/test/CodeGen/AArch64/addsub-shifted.ll
index f2c74f6952b07..269c1e8143b27 100644
--- a/test/CodeGen/AArch64/addsub-shifted.ll
+++ b/test/CodeGen/AArch64/addsub-shifted.ll
@@ -4,7 +4,7 @@
@var64 = global i64 0
define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
-; CHECK: test_lsl_arith:
+; CHECK-LABEL: test_lsl_arith:
%rhs1 = load volatile i32* @var32
%shift1 = shl i32 %rhs1, 18
@@ -73,7 +73,7 @@ define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
}
define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
-; CHECK: test_lsr_arith:
+; CHECK-LABEL: test_lsr_arith:
%shift1 = lshr i32 %rhs32, 18
%val1 = add i32 %lhs32, %shift1
@@ -132,7 +132,7 @@ define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
}
define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
-; CHECK: test_asr_arith:
+; CHECK-LABEL: test_asr_arith:
%shift1 = ashr i32 %rhs32, 18
%val1 = add i32 %lhs32, %shift1
@@ -191,7 +191,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
}
define i32 @test_cmp(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
-; CHECK: test_cmp:
+; CHECK-LABEL: test_cmp:
%shift1 = shl i32 %rhs32, 13
%tst1 = icmp uge i32 %lhs32, %shift1
@@ -237,7 +237,7 @@ end:
}
define i32 @test_cmn(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
-; CHECK: test_cmn:
+; CHECK-LABEL: test_cmn:
%shift1 = shl i32 %rhs32, 13
%val1 = sub i32 0, %shift1
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll
index 5148807163c9a..4d46d04b80f1f 100644
--- a/test/CodeGen/AArch64/addsub.ll
+++ b/test/CodeGen/AArch64/addsub.ll
@@ -9,7 +9,7 @@
; Add pure 12-bit immediates:
define void @add_small() {
-; CHECK: add_small:
+; CHECK-LABEL: add_small:
; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #4095
%val32 = load i32* @var_i32
@@ -26,7 +26,7 @@ define void @add_small() {
; Add 12-bit immediates, shifted left by 12 bits
define void @add_med() {
-; CHECK: add_med:
+; CHECK-LABEL: add_med:
; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12
%val32 = load i32* @var_i32
@@ -43,7 +43,7 @@ define void @add_med() {
; Subtract 12-bit immediates
define void @sub_small() {
-; CHECK: sub_small:
+; CHECK-LABEL: sub_small:
; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4095
%val32 = load i32* @var_i32
@@ -60,7 +60,7 @@ define void @sub_small() {
; Subtract 12-bit immediates, shifted left by 12 bits
define void @sub_med() {
-; CHECK: sub_med:
+; CHECK-LABEL: sub_med:
; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12
%val32 = load i32* @var_i32
@@ -76,7 +76,7 @@ define void @sub_med() {
}
define void @testing() {
-; CHECK: testing:
+; CHECK-LABEL: testing:
%val = load i32* @var_i32
; CHECK: cmp {{w[0-9]+}}, #4095
diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll
index 2dd16626ea9f2..f0e11c652240c 100644
--- a/test/CodeGen/AArch64/addsub_ext.ll
+++ b/test/CodeGen/AArch64/addsub_ext.ll
@@ -6,7 +6,7 @@
@var64 = global i64 0
define void @addsub_i8rhs() {
-; CHECK: addsub_i8rhs:
+; CHECK-LABEL: addsub_i8rhs:
%val8_tmp = load i8* @var8
%lhs32 = load i32* @var32
%lhs64 = load i64* @var64
@@ -81,7 +81,7 @@ end:
}
define void @addsub_i16rhs() {
-; CHECK: addsub_i16rhs:
+; CHECK-LABEL: addsub_i16rhs:
%val16_tmp = load i16* @var16
%lhs32 = load i32* @var32
%lhs64 = load i64* @var64
@@ -159,7 +159,7 @@ end:
; example), but the remaining instructions are probably not idiomatic
; in the face of "add/sub (shifted register)" so I don't intend to.
define void @addsub_i32rhs() {
-; CHECK: addsub_i32rhs:
+; CHECK-LABEL: addsub_i32rhs:
%val32_tmp = load i32* @var32
%lhs64 = load i64* @var64
@@ -186,4 +186,4 @@ define void @addsub_i32rhs() {
; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw #2
ret void
-}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/adrp-relocation.ll b/test/CodeGen/AArch64/adrp-relocation.ll deleted file mode 100644 index cf411166a3a04..0000000000000 --- a/test/CodeGen/AArch64/adrp-relocation.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -filetype=obj < %s | llvm-readobj -s -r | FileCheck %s - -define i64 @testfn() nounwind { -entry: - ret i64 0 -} - -define i64 @foo() nounwind { -entry: - %bar = alloca i64 ()*, align 8 - store i64 ()* @testfn, i64 ()** %bar, align 8 - %call = call i64 @testfn() - ret i64 %call -} - -; The above should produce an ADRP/ADD pair to calculate the address of -; testfn. The important point is that LLVM shouldn't think it can deal with the -; relocation on the ADRP itself (even though it knows everything about the -; relative offsets of testfn and foo) because its value depends on where this -; object file's .text section gets relocated in memory. - -; CHECK: Relocations [ -; CHECK-NEXT: Section (1) .text { -; CHECK-NEXT: 0x10 R_AARCH64_ADR_PREL_PG_HI21 testfn 0x0 -; CHECK-NEXT: 0x14 R_AARCH64_ADD_ABS_LO12_NC testfn 0x0 -; CHECK-NEXT: } -; CHECK-NEXT: ] diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index c62edf6503c6b..1d3c0a02ac879 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -1,19 +1,20 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s declare void @use_addr(i8*) define void @test_simple_alloca(i64 %n) { -; CHECK: test_simple_alloca: +; CHECK-LABEL: test_simple_alloca: %buf = alloca i8, i64 %n ; Make sure we align the stack change to 16 bytes: -; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15 -; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0 +; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15 +; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0 ; Make sure we change SP. It would be surprising if anything but x0 were used ; for the final sp, but it could be if it was then moved into x0. -; CHECK: mov [[TMP:x[0-9]+]], sp -; CHECK: sub x0, [[TMP]], [[SPDELTA]] +; CHECK-DAG: mov [[TMP:x[0-9]+]], sp +; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]] ; CHECK: mov sp, x0 call void @use_addr(i8* %buf) @@ -30,20 +31,20 @@ define void @test_simple_alloca(i64 %n) { declare void @use_addr_loc(i8*, i64*) define i64 @test_alloca_with_local(i64 %n) { -; CHECK: test_alloca_with_local: +; CHECK-LABEL: test_alloca_with_local: ; CHECK: sub sp, sp, #32 ; CHECK: stp x29, x30, [sp, #16] %loc = alloca i64 %buf = alloca i8, i64 %n ; Make sure we align the stack change to 16 bytes: -; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15 -; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0 +; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15 +; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0 ; Make sure we change SP. It would be surprising if anything but x0 were used ; for the final sp, but it could be if it was then moved into x0. -; CHECK: mov [[TMP:x[0-9]+]], sp -; CHECK: sub x0, [[TMP]], [[SPDELTA]] +; CHECK-DAG: mov [[TMP:x[0-9]+]], sp +; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]] ; CHECK: mov sp, x0 ; Obviously suboptimal code here, but it to get &local in x1 @@ -73,9 +74,15 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK: add x29, sp, #192 ; CHECK: sub [[TMP:x[0-9]+]], x29, #192 ; CHECK: add x8, [[TMP]], #0 -; CHECK: str q7, [x8, #112] +; CHECK-FP: str q7, [x8, #112] ; [...] 
-; CHECK: str q1, [x8, #16] +; CHECK-FP: str q1, [x8, #16] + +; CHECK-NOFP: sub sp, sp, #80 +; CHECK-NOFP: stp x29, x30, [sp, #64] +; CHECK-NOFP: add x29, sp, #64 +; CHECK-NOFP: sub [[TMP:x[0-9]+]], x29, #64 +; CHECK-NOFP: add x8, [[TMP]], #0 %addr = alloca i8, i64 %n @@ -86,10 +93,14 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK: sub sp, x29, #192 ; CHECK: ldp x29, x30, [sp, #192] ; CHECK: add sp, sp, #208 + +; CHECK-NOFP: sub sp, x29, #64 +; CHECK-NOFP: ldp x29, x30, [sp, #64] +; CHECK-NOFP: add sp, sp, #80 } define void @test_alloca_large_frame(i64 %n) { -; CHECK: test_alloca_large_frame: +; CHECK-LABEL: test_alloca_large_frame: ; CHECK: sub sp, sp, #496 ; CHECK: stp x29, x30, [sp, #480] @@ -112,16 +123,16 @@ declare i8* @llvm.stacksave() declare void @llvm.stackrestore(i8*) define void @test_scoped_alloca(i64 %n) { -; CHECK: test_scoped_alloca +; CHECK-LABEL: test_scoped_alloca: ; CHECK: sub sp, sp, #32 %sp = call i8* @llvm.stacksave() ; CHECK: mov [[SAVED_SP:x[0-9]+]], sp +; CHECK: mov [[OLDSP:x[0-9]+]], sp %addr = alloca i8, i64 %n ; CHECK: and [[SPDELTA:x[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 -; CHECK: mov [[OLDSP:x[0-9]+]], sp -; CHECK: sub [[NEWSP:x[0-9]+]], [[OLDSP]], [[SPDELTA]] +; CHECK-DAG: sub [[NEWSP:x[0-9]+]], [[OLDSP]], [[SPDELTA]] ; CHECK: mov sp, [[NEWSP]] call void @use_addr(i8* %addr) diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll index e10bbb0f86912..36bc2e00d2389 100644 --- a/test/CodeGen/AArch64/analyze-branch.ll +++ b/test/CodeGen/AArch64/analyze-branch.ll @@ -11,7 +11,7 @@ declare void @test_false() !1 = metadata !{metadata !"branch_weights", i32 4, i32 64} define void @test_Bcc_fallthrough_taken(i32 %in) nounwind { -; CHECK: test_Bcc_fallthrough_taken: +; CHECK-LABEL: test_Bcc_fallthrough_taken: %tst = icmp eq i32 %in, 42 br i1 %tst, label %true, label %false, !prof !0 @@ -34,7 +34,7 @@ false: } define void @test_Bcc_fallthrough_nottaken(i32 %in) nounwind { -; CHECK: test_Bcc_fallthrough_nottaken: +; CHECK-LABEL: test_Bcc_fallthrough_nottaken: %tst = icmp eq i32 %in, 42 br i1 %tst, label %true, label %false, !prof !1 @@ -57,7 +57,7 @@ false: } define void @test_CBZ_fallthrough_taken(i32 %in) nounwind { -; CHECK: test_CBZ_fallthrough_taken: +; CHECK-LABEL: test_CBZ_fallthrough_taken: %tst = icmp eq i32 %in, 0 br i1 %tst, label %true, label %false, !prof !0 @@ -78,7 +78,7 @@ false: } define void @test_CBZ_fallthrough_nottaken(i64 %in) nounwind { -; CHECK: test_CBZ_fallthrough_nottaken: +; CHECK-LABEL: test_CBZ_fallthrough_nottaken: %tst = icmp eq i64 %in, 0 br i1 %tst, label %true, label %false, !prof !1 @@ -99,7 +99,7 @@ false: } define void @test_CBNZ_fallthrough_taken(i32 %in) nounwind { -; CHECK: test_CBNZ_fallthrough_taken: +; CHECK-LABEL: test_CBNZ_fallthrough_taken: %tst = icmp ne i32 %in, 0 br i1 %tst, label %true, label %false, !prof !0 @@ -120,7 +120,7 @@ false: } define void @test_CBNZ_fallthrough_nottaken(i64 %in) nounwind { -; CHECK: test_CBNZ_fallthrough_nottaken: +; CHECK-LABEL: test_CBNZ_fallthrough_nottaken: %tst = icmp ne i64 %in, 0 br i1 %tst, label %true, label %false, !prof !1 @@ -141,7 +141,7 @@ false: } define void @test_TBZ_fallthrough_taken(i32 %in) nounwind { -; CHECK: test_TBZ_fallthrough_taken: +; CHECK-LABEL: test_TBZ_fallthrough_taken: %bit = and i32 %in, 32768 %tst = icmp eq i32 %bit, 0 br i1 %tst, label %true, label %false, !prof !0 @@ -163,7 +163,7 @@ false: } define void @test_TBZ_fallthrough_nottaken(i64 %in) nounwind { -; CHECK: test_TBZ_fallthrough_nottaken: +; 
CHECK-LABEL: test_TBZ_fallthrough_nottaken: %bit = and i64 %in, 32768 %tst = icmp eq i64 %bit, 0 br i1 %tst, label %true, label %false, !prof !1 @@ -186,7 +186,7 @@ false: define void @test_TBNZ_fallthrough_taken(i32 %in) nounwind { -; CHECK: test_TBNZ_fallthrough_taken: +; CHECK-LABEL: test_TBNZ_fallthrough_taken: %bit = and i32 %in, 32768 %tst = icmp ne i32 %bit, 0 br i1 %tst, label %true, label %false, !prof !0 @@ -208,7 +208,7 @@ false: } define void @test_TBNZ_fallthrough_nottaken(i64 %in) nounwind { -; CHECK: test_TBNZ_fallthrough_nottaken: +; CHECK-LABEL: test_TBNZ_fallthrough_nottaken: %bit = and i64 %in, 32768 %tst = icmp ne i64 %bit, 0 br i1 %tst, label %true, label %false, !prof !1 diff --git a/test/CodeGen/AArch64/atomic-ops-not-barriers.ll b/test/CodeGen/AArch64/atomic-ops-not-barriers.ll index 9888a742e32b8..da095a0a42c5f 100644 --- a/test/CodeGen/AArch64/atomic-ops-not-barriers.ll +++ b/test/CodeGen/AArch64/atomic-ops-not-barriers.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s define i32 @foo(i32* %var, i1 %cond) { -; CHECK: foo: +; CHECK-LABEL: foo: br i1 %cond, label %atomic_ver, label %simple_ver simple_ver: %oldval = load i32* %var diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll index 5e87f21a217d8..de84ff46ec3b9 100644 --- a/test/CodeGen/AArch64/atomic-ops.ll +++ b/test/CodeGen/AArch64/atomic-ops.ll @@ -6,7 +6,7 @@ @var64 = global i64 0 define i8 @test_atomic_load_add_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_add_i8: +; CHECK-LABEL: test_atomic_load_add_i8: %old = atomicrmw add i8* @var8, i8 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -26,7 +26,7 @@ define i8 @test_atomic_load_add_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_add_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_add_i16: +; CHECK-LABEL: test_atomic_load_add_i16: %old = atomicrmw add i16* @var16, i16 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -46,7 +46,7 @@ define i16 @test_atomic_load_add_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_add_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_add_i32: +; CHECK-LABEL: test_atomic_load_add_i32: %old = atomicrmw add i32* @var32, i32 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -66,7 +66,7 @@ define i32 @test_atomic_load_add_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_add_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_add_i64: +; CHECK-LABEL: test_atomic_load_add_i64: %old = atomicrmw add i64* @var64, i64 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -86,7 +86,7 @@ define i64 @test_atomic_load_add_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_sub_i8: +; CHECK-LABEL: test_atomic_load_sub_i8: %old = atomicrmw sub i8* @var8, i8 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -106,7 +106,7 @@ define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_sub_i16: +; CHECK-LABEL: test_atomic_load_sub_i16: %old = atomicrmw sub i16* @var16, i16 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -126,7 +126,7 @@ define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_sub_i32: +; 
CHECK-LABEL: test_atomic_load_sub_i32: %old = atomicrmw sub i32* @var32, i32 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -146,7 +146,7 @@ define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_sub_i64: +; CHECK-LABEL: test_atomic_load_sub_i64: %old = atomicrmw sub i64* @var64, i64 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -166,7 +166,7 @@ define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_and_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_and_i8: +; CHECK-LABEL: test_atomic_load_and_i8: %old = atomicrmw and i8* @var8, i8 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -186,7 +186,7 @@ define i8 @test_atomic_load_and_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_and_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_and_i16: +; CHECK-LABEL: test_atomic_load_and_i16: %old = atomicrmw and i16* @var16, i16 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -206,7 +206,7 @@ define i16 @test_atomic_load_and_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_and_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_and_i32: +; CHECK-LABEL: test_atomic_load_and_i32: %old = atomicrmw and i32* @var32, i32 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -226,7 +226,7 @@ define i32 @test_atomic_load_and_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_and_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_and_i64: +; CHECK-LABEL: test_atomic_load_and_i64: %old = atomicrmw and i64* @var64, i64 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -246,7 +246,7 @@ define i64 @test_atomic_load_and_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_or_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_or_i8: +; CHECK-LABEL: test_atomic_load_or_i8: %old = atomicrmw or i8* @var8, i8 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -266,7 +266,7 @@ define i8 @test_atomic_load_or_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_or_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_or_i16: +; CHECK-LABEL: test_atomic_load_or_i16: %old = atomicrmw or i16* @var16, i16 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -286,7 +286,7 @@ define i16 @test_atomic_load_or_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_or_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_or_i32: +; CHECK-LABEL: test_atomic_load_or_i32: %old = atomicrmw or i32* @var32, i32 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -306,7 +306,7 @@ define i32 @test_atomic_load_or_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_or_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_or_i64: +; CHECK-LABEL: test_atomic_load_or_i64: %old = atomicrmw or i64* @var64, i64 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -326,7 +326,7 @@ define i64 @test_atomic_load_or_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_xor_i8: +; CHECK-LABEL: test_atomic_load_xor_i8: %old = atomicrmw xor i8* @var8, i8 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -346,7 +346,7 @@ define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind { } define i16 
@test_atomic_load_xor_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_xor_i16: +; CHECK-LABEL: test_atomic_load_xor_i16: %old = atomicrmw xor i16* @var16, i16 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -366,7 +366,7 @@ define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_xor_i32: +; CHECK-LABEL: test_atomic_load_xor_i32: %old = atomicrmw xor i32* @var32, i32 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -386,7 +386,7 @@ define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_xor_i64: +; CHECK-LABEL: test_atomic_load_xor_i64: %old = atomicrmw xor i64* @var64, i64 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -406,7 +406,7 @@ define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_xchg_i8: +; CHECK-LABEL: test_atomic_load_xchg_i8: %old = atomicrmw xchg i8* @var8, i8 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -425,7 +425,7 @@ define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_xchg_i16: +; CHECK-LABEL: test_atomic_load_xchg_i16: %old = atomicrmw xchg i16* @var16, i16 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -444,7 +444,7 @@ define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_xchg_i32: +; CHECK-LABEL: test_atomic_load_xchg_i32: %old = atomicrmw xchg i32* @var32, i32 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -463,7 +463,7 @@ define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_xchg_i64: +; CHECK-LABEL: test_atomic_load_xchg_i64: %old = atomicrmw xchg i64* @var64, i64 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -483,7 +483,7 @@ define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind { define i8 @test_atomic_load_min_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_min_i8: +; CHECK-LABEL: test_atomic_load_min_i8: %old = atomicrmw min i8* @var8, i8 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -504,7 +504,7 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_min_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_min_i16: +; CHECK-LABEL: test_atomic_load_min_i16: %old = atomicrmw min i16* @var16, i16 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -525,7 +525,7 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_min_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_min_i32: +; CHECK-LABEL: test_atomic_load_min_i32: %old = atomicrmw min i32* @var32, i32 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -546,7 +546,7 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_min_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_min_i64: +; CHECK-LABEL: test_atomic_load_min_i64: %old = atomicrmw min i64* @var64, i64 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp 
[[TMPADDR:x[0-9]+]], var64 @@ -567,7 +567,7 @@ define i64 @test_atomic_load_min_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_max_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_max_i8: +; CHECK-LABEL: test_atomic_load_max_i8: %old = atomicrmw max i8* @var8, i8 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -588,7 +588,7 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_max_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_max_i16: +; CHECK-LABEL: test_atomic_load_max_i16: %old = atomicrmw max i16* @var16, i16 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -609,7 +609,7 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_max_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_max_i32: +; CHECK-LABEL: test_atomic_load_max_i32: %old = atomicrmw max i32* @var32, i32 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -630,7 +630,7 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_max_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_max_i64: +; CHECK-LABEL: test_atomic_load_max_i64: %old = atomicrmw max i64* @var64, i64 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -651,7 +651,7 @@ define i64 @test_atomic_load_max_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_umin_i8: +; CHECK-LABEL: test_atomic_load_umin_i8: %old = atomicrmw umin i8* @var8, i8 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -672,7 +672,7 @@ define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_umin_i16: +; CHECK-LABEL: test_atomic_load_umin_i16: %old = atomicrmw umin i16* @var16, i16 %offset acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -693,7 +693,7 @@ define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_umin_i32: +; CHECK-LABEL: test_atomic_load_umin_i32: %old = atomicrmw umin i32* @var32, i32 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -714,7 +714,7 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_umin_i64: +; CHECK-LABEL: test_atomic_load_umin_i64: %old = atomicrmw umin i64* @var64, i64 %offset acq_rel ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -735,7 +735,7 @@ define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind { } define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind { -; CHECK: test_atomic_load_umax_i8: +; CHECK-LABEL: test_atomic_load_umax_i8: %old = atomicrmw umax i8* @var8, i8 %offset acq_rel ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -756,7 +756,7 @@ define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind { } define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind { -; CHECK: test_atomic_load_umax_i16: +; CHECK-LABEL: test_atomic_load_umax_i16: %old = atomicrmw umax i16* @var16, i16 %offset monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -777,7 +777,7 @@ define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind { } define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind { -; CHECK: test_atomic_load_umax_i32: +; 
CHECK-LABEL: test_atomic_load_umax_i32: %old = atomicrmw umax i32* @var32, i32 %offset seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -798,7 +798,7 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind { } define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind { -; CHECK: test_atomic_load_umax_i64: +; CHECK-LABEL: test_atomic_load_umax_i64: %old = atomicrmw umax i64* @var64, i64 %offset release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -819,7 +819,7 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind { } define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { -; CHECK: test_atomic_cmpxchg_i8: +; CHECK-LABEL: test_atomic_cmpxchg_i8: %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -841,7 +841,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { } define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { -; CHECK: test_atomic_cmpxchg_i16: +; CHECK-LABEL: test_atomic_cmpxchg_i16: %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 @@ -863,7 +863,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { } define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { -; CHECK: test_atomic_cmpxchg_i32: +; CHECK-LABEL: test_atomic_cmpxchg_i32: %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 @@ -885,7 +885,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { } define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { -; CHECK: test_atomic_cmpxchg_i64: +; CHECK-LABEL: test_atomic_cmpxchg_i64: %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 @@ -907,7 +907,7 @@ define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { } define i8 @test_atomic_load_monotonic_i8() nounwind { -; CHECK: test_atomic_load_monotonic_i8: +; CHECK-LABEL: test_atomic_load_monotonic_i8: %val = load atomic i8* @var8 monotonic, align 1 ; CHECK-NOT: dmb ; CHECK: adrp x[[HIADDR:[0-9]+]], var8 @@ -918,7 +918,7 @@ define i8 @test_atomic_load_monotonic_i8() nounwind { } define i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind { -; CHECK: test_atomic_load_monotonic_regoff_i8: +; CHECK-LABEL: test_atomic_load_monotonic_regoff_i8: %addr_int = add i64 %base, %off %addr = inttoptr i64 %addr_int to i8* @@ -931,7 +931,7 @@ define i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind { } define i8 @test_atomic_load_acquire_i8() nounwind { -; CHECK: test_atomic_load_acquire_i8: +; CHECK-LABEL: test_atomic_load_acquire_i8: %val = load atomic i8* @var8 acquire, align 1 ; CHECK-NOT: dmb ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 @@ -944,7 +944,7 @@ define i8 @test_atomic_load_acquire_i8() nounwind { } define i8 @test_atomic_load_seq_cst_i8() nounwind { -; CHECK: test_atomic_load_seq_cst_i8: +; CHECK-LABEL: test_atomic_load_seq_cst_i8: %val = load atomic i8* @var8 seq_cst, align 1 ; CHECK-NOT: dmb ; CHECK: adrp [[HIADDR:x[0-9]+]], var8 @@ -957,7 +957,7 @@ define i8 @test_atomic_load_seq_cst_i8() nounwind { } define i16 @test_atomic_load_monotonic_i16() nounwind { -; CHECK: test_atomic_load_monotonic_i16: +; CHECK-LABEL: test_atomic_load_monotonic_i16: %val = load atomic i16* @var16 monotonic, align 2 ; CHECK-NOT: dmb ; CHECK: adrp x[[HIADDR:[0-9]+]], var16 @@ -969,7 +969,7 @@ 
define i16 @test_atomic_load_monotonic_i16() nounwind { } define i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind { -; CHECK: test_atomic_load_monotonic_regoff_i32: +; CHECK-LABEL: test_atomic_load_monotonic_regoff_i32: %addr_int = add i64 %base, %off %addr = inttoptr i64 %addr_int to i32* @@ -982,7 +982,7 @@ define i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind } define i64 @test_atomic_load_seq_cst_i64() nounwind { -; CHECK: test_atomic_load_seq_cst_i64: +; CHECK-LABEL: test_atomic_load_seq_cst_i64: %val = load atomic i64* @var64 seq_cst, align 8 ; CHECK-NOT: dmb ; CHECK: adrp [[HIADDR:x[0-9]+]], var64 @@ -995,7 +995,7 @@ define i64 @test_atomic_load_seq_cst_i64() nounwind { } define void @test_atomic_store_monotonic_i8(i8 %val) nounwind { -; CHECK: test_atomic_store_monotonic_i8: +; CHECK-LABEL: test_atomic_store_monotonic_i8: store atomic i8 %val, i8* @var8 monotonic, align 1 ; CHECK: adrp x[[HIADDR:[0-9]+]], var8 ; CHECK: strb w0, [x[[HIADDR]], #:lo12:var8] @@ -1004,7 +1004,7 @@ define void @test_atomic_store_monotonic_i8(i8 %val) nounwind { } define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val) nounwind { -; CHECK: test_atomic_store_monotonic_regoff_i8: +; CHECK-LABEL: test_atomic_store_monotonic_regoff_i8: %addr_int = add i64 %base, %off %addr = inttoptr i64 %addr_int to i8* @@ -1015,7 +1015,7 @@ define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val) ret void } define void @test_atomic_store_release_i8(i8 %val) nounwind { -; CHECK: test_atomic_store_release_i8: +; CHECK-LABEL: test_atomic_store_release_i8: store atomic i8 %val, i8* @var8 release, align 1 ; CHECK-NOT: dmb ; CHECK: adrp [[HIADDR:x[0-9]+]], var8 @@ -1028,7 +1028,7 @@ define void @test_atomic_store_release_i8(i8 %val) nounwind { } define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind { -; CHECK: test_atomic_store_seq_cst_i8: +; CHECK-LABEL: test_atomic_store_seq_cst_i8: store atomic i8 %val, i8* @var8 seq_cst, align 1 ; CHECK-NOT: dmb ; CHECK: adrp [[HIADDR:x[0-9]+]], var8 @@ -1042,7 +1042,7 @@ define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind { } define void @test_atomic_store_monotonic_i16(i16 %val) nounwind { -; CHECK: test_atomic_store_monotonic_i16: +; CHECK-LABEL: test_atomic_store_monotonic_i16: store atomic i16 %val, i16* @var16 monotonic, align 2 ; CHECK-NOT: dmb ; CHECK: adrp x[[HIADDR:[0-9]+]], var16 @@ -1053,7 +1053,7 @@ define void @test_atomic_store_monotonic_i16(i16 %val) nounwind { } define void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %val) nounwind { -; CHECK: test_atomic_store_monotonic_regoff_i32: +; CHECK-LABEL: test_atomic_store_monotonic_regoff_i32: %addr_int = add i64 %base, %off %addr = inttoptr i64 %addr_int to i32* @@ -1067,7 +1067,7 @@ define void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %va } define void @test_atomic_store_release_i64(i64 %val) nounwind { -; CHECK: test_atomic_store_release_i64: +; CHECK-LABEL: test_atomic_store_release_i64: store atomic i64 %val, i64* @var64 release, align 8 ; CHECK-NOT: dmb ; CHECK: adrp [[HIADDR:x[0-9]+]], var64 diff --git a/test/CodeGen/AArch64/basic-pic.ll b/test/CodeGen/AArch64/basic-pic.ll index da94041c95ffc..682b7ba69d95c 100644 --- a/test/CodeGen/AArch64/basic-pic.ll +++ b/test/CodeGen/AArch64/basic-pic.ll @@ -1,70 +1,54 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu 
-verify-machineinstrs -relocation-model=pic -filetype=obj %s -o -| llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s @var = global i32 0 -; CHECK-ELF: RELOCATION RECORDS FOR [.text] - define i32 @get_globalvar() { -; CHECK: get_globalvar: +; CHECK-LABEL: get_globalvar: %val = load i32* @var ; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var ; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], #:got_lo12:var] ; CHECK: ldr w0, [x[[GOTLOC]]] -; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var -; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var ret i32 %val } define i32* @get_globalvaraddr() { -; CHECK: get_globalvaraddr: +; CHECK-LABEL: get_globalvaraddr: %val = load i32* @var ; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var ; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:var] -; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var -; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var ret i32* @var } @hiddenvar = hidden global i32 0 define i32 @get_hiddenvar() { -; CHECK: get_hiddenvar: +; CHECK-LABEL: get_hiddenvar: %val = load i32* @hiddenvar ; CHECK: adrp x[[HI:[0-9]+]], hiddenvar ; CHECK: ldr w0, [x[[HI]], #:lo12:hiddenvar] -; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar -; CHECK-ELF: R_AARCH64_LDST32_ABS_LO12_NC hiddenvar ret i32 %val } define i32* @get_hiddenvaraddr() { -; CHECK: get_hiddenvaraddr: +; CHECK-LABEL: get_hiddenvaraddr: %val = load i32* @hiddenvar ; CHECK: adrp [[HI:x[0-9]+]], hiddenvar ; CHECK: add x0, [[HI]], #:lo12:hiddenvar -; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar -; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC hiddenvar ret i32* @hiddenvar } define void()* @get_func() { -; CHECK: get_func: +; CHECK-LABEL: get_func: ret void()* bitcast(void()*()* @get_func to void()*) ; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func ; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:get_func] - - ; Particularly important that the ADRP gets a relocation, LLVM tends to think - ; it can relax it because it knows where get_func is. It can't! -; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE get_func -; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC get_func -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/bitfield-insert-0.ll b/test/CodeGen/AArch64/bitfield-insert-0.ll index d1191f6aaa8a6..37a18b7fb613e 100644 --- a/test/CodeGen/AArch64/bitfield-insert-0.ll +++ b/test/CodeGen/AArch64/bitfield-insert-0.ll @@ -16,4 +16,4 @@ define void @test_bfi0(i32* %existing, i32* %new) { store volatile i32 %combined, i32* %existing ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 3e871b9a6d273..1f046087abc09 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -6,7 +6,7 @@ %struct.foo = type { i8, [2 x i8], i8 } define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone { -; CHECK: from_clang: +; CHECK-LABEL: from_clang: ; CHECK: bfi w0, w1, #3, #4 ; CHECK-NEXT: ret @@ -25,7 +25,7 @@ entry: } define void @test_whole32(i32* %existing, i32* %new) { -; CHECK: test_whole32: +; CHECK-LABEL: test_whole32: ; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5 %oldval = load volatile i32* %existing @@ -42,7 +42,7 @@ define void @test_whole32(i32* %existing, i32* %new) { } define void @test_whole64(i64* %existing, i64* %new) { -; CHECK: test_whole64: +; CHECK-LABEL: test_whole64: ; CHECK: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14 ; CHECK-NOT: and ; CHECK: ret @@ -61,7 +61,7 @@ define void @test_whole64(i64* %existing, i64* %new) { } define void @test_whole32_from64(i64* %existing, i64* %new) { -; CHECK: test_whole32_from64: +; CHECK-LABEL: test_whole32_from64: ; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16 ; CHECK-NOT: and ; CHECK: ret @@ -79,7 +79,7 @@ define void @test_whole32_from64(i64* %existing, i64* %new) { } define void @test_32bit_masked(i32 *%existing, i32 *%new) { -; CHECK: test_32bit_masked: +; CHECK-LABEL: test_32bit_masked: ; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4 ; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff @@ -97,7 +97,7 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) { } define void @test_64bit_masked(i64 *%existing, i64 *%new) { -; CHECK: test_64bit_masked: +; CHECK-LABEL: test_64bit_masked: ; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8 ; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000 @@ -116,7 +116,7 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) { ; Mask is too complicated for literal ANDwwi, make sure other avenues are tried. define void @test_32bit_complexmask(i32 *%existing, i32 *%new) { -; CHECK: test_32bit_complexmask: +; CHECK-LABEL: test_32bit_complexmask: ; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} @@ -135,7 +135,7 @@ define void @test_32bit_complexmask(i32 *%existing, i32 *%new) { ; Neither mask is is a contiguous set of 1s. BFI can't be used define void @test_32bit_badmask(i32 *%existing, i32 *%new) { -; CHECK: test_32bit_badmask: +; CHECK-LABEL: test_32bit_badmask: ; CHECK-NOT: bfi ; CHECK: ret @@ -154,7 +154,7 @@ define void @test_32bit_badmask(i32 *%existing, i32 *%new) { ; Ditto define void @test_64bit_badmask(i64 *%existing, i64 *%new) { -; CHECK: test_64bit_badmask: +; CHECK-LABEL: test_64bit_badmask: ; CHECK-NOT: bfi ; CHECK: ret @@ -174,7 +174,7 @@ define void @test_64bit_badmask(i64 *%existing, i64 *%new) { ; Bitfield insert where there's a left-over shr needed at the beginning ; (e.g. 
result of str.bf1 = str.bf2) define void @test_32bit_with_shr(i32* %existing, i32* %new) { -; CHECK: test_32bit_with_shr: +; CHECK-LABEL: test_32bit_with_shr: %oldval = load volatile i32* %existing %oldval_keep = and i32 %oldval, 2214592511 ; =0x83ffffff diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll index 36d337ef05ef5..1c84f5d578545 100644 --- a/test/CodeGen/AArch64/bitfield.ll +++ b/test/CodeGen/AArch64/bitfield.ll @@ -5,7 +5,7 @@ @var64 = global i64 0 define void @test_extendb(i8 %var) { -; CHECK: test_extendb: +; CHECK-LABEL: test_extendb: %sxt32 = sext i8 %var to i32 store volatile i32 %sxt32, i32* @var32 @@ -29,7 +29,7 @@ define void @test_extendb(i8 %var) { } define void @test_extendh(i16 %var) { -; CHECK: test_extendh: +; CHECK-LABEL: test_extendh: %sxt32 = sext i16 %var to i32 store volatile i32 %sxt32, i32* @var32 @@ -53,7 +53,7 @@ define void @test_extendh(i16 %var) { } define void @test_extendw(i32 %var) { -; CHECK: test_extendw: +; CHECK-LABEL: test_extendw: %sxt64 = sext i32 %var to i64 store volatile i64 %sxt64, i64* @var64 @@ -66,7 +66,7 @@ define void @test_extendw(i32 %var) { } define void @test_shifts(i32 %val32, i64 %val64) { -; CHECK: test_shifts: +; CHECK-LABEL: test_shifts: %shift1 = ashr i32 %val32, 31 store volatile i32 %shift1, i32* @var32 @@ -114,7 +114,7 @@ define void @test_shifts(i32 %val32, i64 %val64) { ; LLVM can produce in-register extensions taking place entirely with ; 64-bit registers too. define void @test_sext_inreg_64(i64 %in) { -; CHECK: test_sext_inreg_64: +; CHECK-LABEL: test_sext_inreg_64: ; i1 doesn't have an official alias, but crops up and is handled by ; the bitfield ops. @@ -143,7 +143,7 @@ define void @test_sext_inreg_64(i64 %in) { ; These instructions don't actually select to official bitfield ; operations, but it's important that we select them somehow: define void @test_zext_inreg_64(i64 %in) { -; CHECK: test_zext_inreg_64: +; CHECK-LABEL: test_zext_inreg_64: %trunc_i8 = trunc i64 %in to i8 %zext_i8 = zext i8 %trunc_i8 to i64 @@ -164,7 +164,7 @@ define void @test_zext_inreg_64(i64 %in) { } define i64 @test_sext_inreg_from_32(i32 %in) { -; CHECK: test_sext_inreg_from_32: +; CHECK-LABEL: test_sext_inreg_from_32: %small = trunc i32 %in to i1 %ext = sext i1 %small to i64 @@ -178,7 +178,7 @@ define i64 @test_sext_inreg_from_32(i32 %in) { define i32 @test_ubfx32(i32* %addr) { -; CHECK: test_ubfx32: +; CHECK-LABEL: test_ubfx32: ; CHECK: ubfx {{w[0-9]+}}, {{w[0-9]+}}, #23, #3 %fields = load i32* %addr @@ -188,7 +188,7 @@ define i32 @test_ubfx32(i32* %addr) { } define i64 @test_ubfx64(i64* %addr) { -; CHECK: test_ubfx64: +; CHECK-LABEL: test_ubfx64: ; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #25, #10 %fields = load i64* %addr @@ -198,7 +198,7 @@ define i64 @test_ubfx64(i64* %addr) { } define i32 @test_sbfx32(i32* %addr) { -; CHECK: test_sbfx32: +; CHECK-LABEL: test_sbfx32: ; CHECK: sbfx {{w[0-9]+}}, {{w[0-9]+}}, #6, #3 %fields = load i32* %addr @@ -208,7 +208,7 @@ define i32 @test_sbfx32(i32* %addr) { } define i64 @test_sbfx64(i64* %addr) { -; CHECK: test_sbfx64: +; CHECK-LABEL: test_sbfx64: ; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #63 %fields = load i64* %addr diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll index 5e85057a3c3b9..8cda431b8e928 100644 --- a/test/CodeGen/AArch64/blockaddress.ll +++ b/test/CodeGen/AArch64/blockaddress.ll @@ -4,7 +4,7 @@ @addr = global i8* null define void @test_blockaddress() { -; CHECK: test_blockaddress: +; CHECK-LABEL: test_blockaddress: 
store volatile i8* blockaddress(@test_blockaddress, %block), i8** @addr %val = load volatile i8** @addr indirectbr i8* %val, [label %block] diff --git a/test/CodeGen/AArch64/breg.ll b/test/CodeGen/AArch64/breg.ll index 38ed4734e1b41..1ed5b9b755dd3 100644 --- a/test/CodeGen/AArch64/breg.ll +++ b/test/CodeGen/AArch64/breg.ll @@ -3,7 +3,7 @@ @stored_label = global i8* null define void @foo() { -; CHECK: foo: +; CHECK-LABEL: foo: %lab = load i8** @stored_label indirectbr i8* %lab, [label %otherlab, label %retlab] ; CHECK: adrp {{x[0-9]+}}, stored_label diff --git a/test/CodeGen/AArch64/callee-save.ll b/test/CodeGen/AArch64/callee-save.ll index c66aa5bfc5101..52243b05b4b9a 100644 --- a/test/CodeGen/AArch64/callee-save.ll +++ b/test/CodeGen/AArch64/callee-save.ll @@ -3,7 +3,7 @@ @var = global float 0.0 define void @foo() { -; CHECK: foo: +; CHECK-LABEL: foo: ; CHECK: stp d14, d15, [sp ; CHECK: stp d12, d13, [sp diff --git a/test/CodeGen/AArch64/code-model-large-abs.ll b/test/CodeGen/AArch64/code-model-large-abs.ll index a365568e11eea..b387f285d1d4d 100644 --- a/test/CodeGen/AArch64/code-model-large-abs.ll +++ b/test/CodeGen/AArch64/code-model-large-abs.ll @@ -6,7 +6,7 @@ @var64 = global i64 0 define i8* @global_addr() { -; CHECK: global_addr: +; CHECK-LABEL: global_addr: ret i8* @var8 ; The movz/movk calculation should end up returned directly in x0. ; CHECK: movz x0, #:abs_g3:var8 @@ -17,7 +17,7 @@ define i8* @global_addr() { } define i8 @global_i8() { -; CHECK: global_i8: +; CHECK-LABEL: global_i8: %val = load i8* @var8 ret i8 %val ; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8 @@ -28,7 +28,7 @@ define i8 @global_i8() { } define i16 @global_i16() { -; CHECK: global_i16: +; CHECK-LABEL: global_i16: %val = load i16* @var16 ret i16 %val ; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16 @@ -39,7 +39,7 @@ define i16 @global_i16() { } define i32 @global_i32() { -; CHECK: global_i32: +; CHECK-LABEL: global_i32: %val = load i32* @var32 ret i32 %val ; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32 @@ -50,7 +50,7 @@ define i32 @global_i32() { } define i64 @global_i64() { -; CHECK: global_i64: +; CHECK-LABEL: global_i64: %val = load i64* @var64 ret i64 %val ; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64 diff --git a/test/CodeGen/AArch64/compare-branch.ll b/test/CodeGen/AArch64/compare-branch.ll index 4213110497d3b..75efd9d4a0d6b 100644 --- a/test/CodeGen/AArch64/compare-branch.ll +++ b/test/CodeGen/AArch64/compare-branch.ll @@ -4,7 +4,7 @@ @var64 = global i64 0 define void @foo() { -; CHECK: foo: +; CHECK-LABEL: foo: %val1 = load volatile i32* @var32 %tst1 = icmp eq i32 %val1, 0 @@ -35,4 +35,4 @@ test5: end: ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/complex-copy-noneon.ll b/test/CodeGen/AArch64/complex-copy-noneon.ll new file mode 100644 index 0000000000000..4ae547856ecd3 --- /dev/null +++ b/test/CodeGen/AArch64/complex-copy-noneon.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s + +; The DAG combiner decided to use a vector load/store for this struct copy +; previously. This probably shouldn't happen without NEON, but the most +; important thing is that it compiles. + +define void @store_combine() nounwind { + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0 + %src.real = load double* %src.realp + %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1 + %src.imag = load double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll index 3051cf53fdf8f..9c1dfeb3c8d3f 100644 --- a/test/CodeGen/AArch64/cond-sel.ll +++ b/test/CodeGen/AArch64/cond-sel.ll @@ -1,24 +1,25 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var32 = global i32 0 @var64 = global i64 0 define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { -; CHECK: test_csel: +; CHECK-LABEL: test_csel: %tst1 = icmp ugt i32 %lhs32, %rhs32 %val1 = select i1 %tst1, i32 42, i32 52 store i32 %val1, i32* @var32 -; CHECK: movz [[W52:w[0-9]+]], #52 -; CHECK: movz [[W42:w[0-9]+]], #42 +; CHECK-DAG: movz [[W52:w[0-9]+]], #52 +; CHECK-DAG: movz [[W42:w[0-9]+]], #42 ; CHECK: csel {{w[0-9]+}}, [[W42]], [[W52]], hi %rhs64 = sext i32 %rhs32 to i64 %tst2 = icmp sle i64 %lhs64, %rhs64 %val2 = select i1 %tst2, i64 %lhs64, i64 %rhs64 store i64 %val2, i64* @var64 -; CHECK: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw -; CHECK: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]] +; CHECK-DAG: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw +; CHECK-DAG: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]] ; CHECK: csel {{x[0-9]+}}, [[LHS]], [[EXT_RHS]], le ret void @@ -26,10 +27,11 @@ define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { } define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %rhs64) { -; CHECK: test_floatcsel: +; CHECK-LABEL: test_floatcsel: %tst1 = fcmp one float %lhs32, %rhs32 ; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFP-NOT: fcmp %val1 = select i1 %tst1, i32 42, i32 52 store i32 %val1, i32* @var32 ; CHECK: movz [[W52:w[0-9]+]], #52 @@ -40,6 +42,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r %tst2 = fcmp ueq double %lhs64, %rhs64 ; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NOFP-NOT: fcmp %val2 = select i1 %tst2, i64 9, i64 15 store i64 %val2, i64* @var64 ; CHECK: movz [[CONST15:x[0-9]+]], #15 @@ -53,7 +56,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) { -; CHECK: test_csinc: +; CHECK-LABEL: test_csinc: ; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). 
%tst1 = icmp ugt i32 %lhs32, %rhs32 @@ -93,7 +96,7 @@ define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) { } define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) { -; CHECK: test_csinv: +; CHECK-LABEL: test_csinv: ; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). %tst1 = icmp ugt i32 %lhs32, %rhs32 @@ -133,7 +136,7 @@ define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) { } define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) { -; CHECK: test_csneg: +; CHECK-LABEL: test_csneg: ; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). %tst1 = icmp ugt i32 %lhs32, %rhs32 @@ -173,7 +176,7 @@ define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) { } define void @test_cset(i32 %lhs, i32 %rhs, i64 %lhs64) { -; CHECK: test_cset: +; CHECK-LABEL: test_cset: ; N.b. code is not optimal here (32-bit csinc would be better) but ; incoming DAG is too complex @@ -194,7 +197,7 @@ define void @test_cset(i32 %lhs, i32 %rhs, i64 %lhs64) { } define void @test_csetm(i32 %lhs, i32 %rhs, i64 %lhs64) { -; CHECK: test_csetm: +; CHECK-LABEL: test_csetm: %tst1 = icmp eq i32 %lhs, %rhs %val1 = sext i1 %tst1 to i32 diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll index f5d57593bfad0..12c7b6aed6430 100644 --- a/test/CodeGen/AArch64/directcond.ll +++ b/test/CodeGen/AArch64/directcond.ll @@ -1,7 +1,8 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) { -; CHECK: test_select_i32: +; CHECK-LABEL: test_select_i32: %val = select i1 %bit, i32 %a, i32 %b ; CHECK: movz [[ONE:w[0-9]+]], #1 ; CHECK: tst w0, [[ONE]] @@ -11,7 +12,7 @@ define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) { } define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) { -; CHECK: test_select_i64: +; CHECK-LABEL: test_select_i64: %val = select i1 %bit, i64 %a, i64 %b ; CHECK: movz [[ONE:w[0-9]+]], #1 ; CHECK: tst w0, [[ONE]] @@ -21,27 +22,28 @@ define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) { } define float @test_select_float(i1 %bit, float %a, float %b) { -; CHECK: test_select_float: +; CHECK-LABEL: test_select_float: %val = select i1 %bit, float %a, float %b ; CHECK: movz [[ONE:w[0-9]+]], #1 ; CHECK: tst w0, [[ONE]] ; CHECK-NEXT: fcsel s0, s0, s1, ne - +; CHECK-NOFP-NOT: fcsel ret float %val } define double @test_select_double(i1 %bit, double %a, double %b) { -; CHECK: test_select_double: +; CHECK-LABEL: test_select_double: %val = select i1 %bit, double %a, double %b ; CHECK: movz [[ONE:w[0-9]+]], #1 ; CHECK: tst w0, [[ONE]] ; CHECK-NEXT: fcsel d0, d0, d1, ne +; CHECK-NOFP-NOT: fcsel ret double %val } define i32 @test_brcond(i1 %bit) { -; CHECK: test_brcond: +; CHECK-LABEL: test_brcond: br i1 %bit, label %true, label %false ; CHECK: tbz {{w[0-9]+}}, #0, .LBB @@ -56,6 +58,7 @@ define i1 @test_setcc_float(float %lhs, float %rhs) { %val = fcmp oeq float %lhs, %rhs ; CHECK: fcmp s0, s1 ; CHECK: csinc w0, wzr, wzr, ne +; CHECK-NOFP-NOT: fcmp ret i1 %val } @@ -64,6 +67,7 @@ define i1 @test_setcc_double(double %lhs, double %rhs) { %val = fcmp oeq double %lhs, %rhs ; CHECK: fcmp d0, d1 ; CHECK: csinc w0, wzr, wzr, ne +; CHECK-NOFP-NOT: fcmp ret i1 %val } diff --git a/test/CodeGen/AArch64/dp-3source.ll b/test/CodeGen/AArch64/dp-3source.ll index c40d3933b44bb..81d9e15532fa9 100644 --- 
a/test/CodeGen/AArch64/dp-3source.ll +++ b/test/CodeGen/AArch64/dp-3source.ll @@ -1,7 +1,7 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) { -; CHECK: test_madd32: +; CHECK-LABEL: test_madd32: %mid = mul i32 %val1, %val2 %res = add i32 %val0, %mid ; CHECK: madd {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} @@ -9,7 +9,7 @@ define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) { } define i64 @test_madd64(i64 %val0, i64 %val1, i64 %val2) { -; CHECK: test_madd64: +; CHECK-LABEL: test_madd64: %mid = mul i64 %val1, %val2 %res = add i64 %val0, %mid ; CHECK: madd {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} @@ -17,7 +17,7 @@ define i64 @test_madd64(i64 %val0, i64 %val1, i64 %val2) { } define i32 @test_msub32(i32 %val0, i32 %val1, i32 %val2) { -; CHECK: test_msub32: +; CHECK-LABEL: test_msub32: %mid = mul i32 %val1, %val2 %res = sub i32 %val0, %mid ; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} @@ -25,7 +25,7 @@ define i32 @test_msub32(i32 %val0, i32 %val1, i32 %val2) { } define i64 @test_msub64(i64 %val0, i64 %val1, i64 %val2) { -; CHECK: test_msub64: +; CHECK-LABEL: test_msub64: %mid = mul i64 %val1, %val2 %res = sub i64 %val0, %mid ; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} @@ -33,7 +33,7 @@ define i64 @test_msub64(i64 %val0, i64 %val1, i64 %val2) { } define i64 @test_smaddl(i64 %acc, i32 %val1, i32 %val2) { -; CHECK: test_smaddl: +; CHECK-LABEL: test_smaddl: %ext1 = sext i32 %val1 to i64 %ext2 = sext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 @@ -43,7 +43,7 @@ define i64 @test_smaddl(i64 %acc, i32 %val1, i32 %val2) { } define i64 @test_smsubl(i64 %acc, i32 %val1, i32 %val2) { -; CHECK: test_smsubl: +; CHECK-LABEL: test_smsubl: %ext1 = sext i32 %val1 to i64 %ext2 = sext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 @@ -53,7 +53,7 @@ define i64 @test_smsubl(i64 %acc, i32 %val1, i32 %val2) { } define i64 @test_umaddl(i64 %acc, i32 %val1, i32 %val2) { -; CHECK: test_umaddl: +; CHECK-LABEL: test_umaddl: %ext1 = zext i32 %val1 to i64 %ext2 = zext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 @@ -63,7 +63,7 @@ define i64 @test_umaddl(i64 %acc, i32 %val1, i32 %val2) { } define i64 @test_umsubl(i64 %acc, i32 %val1, i32 %val2) { -; CHECK: test_umsubl: +; CHECK-LABEL: test_umsubl: %ext1 = zext i32 %val1 to i64 %ext2 = zext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 @@ -73,7 +73,7 @@ define i64 @test_umsubl(i64 %acc, i32 %val1, i32 %val2) { } define i64 @test_smulh(i64 %lhs, i64 %rhs) { -; CHECK: test_smulh: +; CHECK-LABEL: test_smulh: %ext1 = sext i64 %lhs to i128 %ext2 = sext i64 %rhs to i128 %res = mul i128 %ext1, %ext2 @@ -84,7 +84,7 @@ define i64 @test_smulh(i64 %lhs, i64 %rhs) { } define i64 @test_umulh(i64 %lhs, i64 %rhs) { -; CHECK: test_umulh: +; CHECK-LABEL: test_umulh: %ext1 = zext i64 %lhs to i128 %ext2 = zext i64 %rhs to i128 %res = mul i128 %ext1, %ext2 @@ -95,21 +95,21 @@ define i64 @test_umulh(i64 %lhs, i64 %rhs) { } define i32 @test_mul32(i32 %lhs, i32 %rhs) { -; CHECK: test_mul32: +; CHECK-LABEL: test_mul32: %res = mul i32 %lhs, %rhs ; CHECK: mul {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i32 %res } define i64 @test_mul64(i64 %lhs, i64 %rhs) { -; CHECK: test_mul64: +; CHECK-LABEL: test_mul64: %res = mul i64 %lhs, %rhs ; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i32 @test_mneg32(i32 %lhs, i32 %rhs) { -; CHECK: test_mneg32: +; CHECK-LABEL: test_mneg32: %prod = mul i32 %lhs, %rhs %res = sub i32 0, %prod ; 
CHECK: mneg {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} @@ -117,7 +117,7 @@ define i32 @test_mneg32(i32 %lhs, i32 %rhs) { } define i64 @test_mneg64(i64 %lhs, i64 %rhs) { -; CHECK: test_mneg64: +; CHECK-LABEL: test_mneg64: %prod = mul i64 %lhs, %rhs %res = sub i64 0, %prod ; CHECK: mneg {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} @@ -125,7 +125,7 @@ define i64 @test_mneg64(i64 %lhs, i64 %rhs) { } define i64 @test_smull(i32 %lhs, i32 %rhs) { -; CHECK: test_smull: +; CHECK-LABEL: test_smull: %ext1 = sext i32 %lhs to i64 %ext2 = sext i32 %rhs to i64 %res = mul i64 %ext1, %ext2 @@ -134,7 +134,7 @@ define i64 @test_smull(i32 %lhs, i32 %rhs) { } define i64 @test_umull(i32 %lhs, i32 %rhs) { -; CHECK: test_umull: +; CHECK-LABEL: test_umull: %ext1 = zext i32 %lhs to i64 %ext2 = zext i32 %rhs to i64 %res = mul i64 %ext1, %ext2 @@ -143,7 +143,7 @@ define i64 @test_umull(i32 %lhs, i32 %rhs) { } define i64 @test_smnegl(i32 %lhs, i32 %rhs) { -; CHECK: test_smnegl: +; CHECK-LABEL: test_smnegl: %ext1 = sext i32 %lhs to i64 %ext2 = sext i32 %rhs to i64 %prod = mul i64 %ext1, %ext2 @@ -153,7 +153,7 @@ define i64 @test_smnegl(i32 %lhs, i32 %rhs) { } define i64 @test_umnegl(i32 %lhs, i32 %rhs) { -; CHECK: test_umnegl: +; CHECK-LABEL: test_umnegl: %ext1 = zext i32 %lhs to i64 %ext2 = zext i32 %rhs to i64 %prod = mul i64 %ext1, %ext2 diff --git a/test/CodeGen/AArch64/dp1.ll b/test/CodeGen/AArch64/dp1.ll index 83aa8b4f6631e..6a8d55cdc7ea0 100644 --- a/test/CodeGen/AArch64/dp1.ll +++ b/test/CodeGen/AArch64/dp1.ll @@ -4,7 +4,7 @@ @var64 = global i64 0 define void @rev_i32() { -; CHECK: rev_i32: +; CHECK-LABEL: rev_i32: %val0_tmp = load i32* @var32 %val1_tmp = call i32 @llvm.bswap.i32(i32 %val0_tmp) ; CHECK: rev {{w[0-9]+}}, {{w[0-9]+}} @@ -13,7 +13,7 @@ define void @rev_i32() { } define void @rev_i64() { -; CHECK: rev_i64: +; CHECK-LABEL: rev_i64: %val0_tmp = load i64* @var64 %val1_tmp = call i64 @llvm.bswap.i64(i64 %val0_tmp) ; CHECK: rev {{x[0-9]+}}, {{x[0-9]+}} @@ -22,7 +22,7 @@ define void @rev_i64() { } define void @rev32_i64() { -; CHECK: rev32_i64: +; CHECK-LABEL: rev32_i64: %val0_tmp = load i64* @var64 %val1_tmp = shl i64 %val0_tmp, 32 %val5_tmp = sub i64 64, 32 @@ -35,7 +35,7 @@ define void @rev32_i64() { } define void @rev16_i32() { -; CHECK: rev16_i32: +; CHECK-LABEL: rev16_i32: %val0_tmp = load i32* @var32 %val1_tmp = shl i32 %val0_tmp, 16 %val2_tmp = lshr i32 %val0_tmp, 16 @@ -47,7 +47,7 @@ define void @rev16_i32() { } define void @clz_zerodef_i32() { -; CHECK: clz_zerodef_i32: +; CHECK-LABEL: clz_zerodef_i32: %val0_tmp = load i32* @var32 %val4_tmp = call i32 @llvm.ctlz.i32(i32 %val0_tmp, i1 0) ; CHECK: clz {{w[0-9]+}}, {{w[0-9]+}} @@ -56,7 +56,7 @@ define void @clz_zerodef_i32() { } define void @clz_zerodef_i64() { -; CHECK: clz_zerodef_i64: +; CHECK-LABEL: clz_zerodef_i64: %val0_tmp = load i64* @var64 %val4_tmp = call i64 @llvm.ctlz.i64(i64 %val0_tmp, i1 0) ; CHECK: clz {{x[0-9]+}}, {{x[0-9]+}} @@ -65,7 +65,7 @@ define void @clz_zerodef_i64() { } define void @clz_zeroundef_i32() { -; CHECK: clz_zeroundef_i32: +; CHECK-LABEL: clz_zeroundef_i32: %val0_tmp = load i32* @var32 %val4_tmp = call i32 @llvm.ctlz.i32(i32 %val0_tmp, i1 1) ; CHECK: clz {{w[0-9]+}}, {{w[0-9]+}} @@ -74,7 +74,7 @@ define void @clz_zeroundef_i32() { } define void @clz_zeroundef_i64() { -; CHECK: clz_zeroundef_i64: +; CHECK-LABEL: clz_zeroundef_i64: %val0_tmp = load i64* @var64 %val4_tmp = call i64 @llvm.ctlz.i64(i64 %val0_tmp, i1 1) ; CHECK: clz {{x[0-9]+}}, {{x[0-9]+}} @@ -83,7 +83,7 @@ define void @clz_zeroundef_i64() { } define void 
@cttz_zerodef_i32() { -; CHECK: cttz_zerodef_i32: +; CHECK-LABEL: cttz_zerodef_i32: %val0_tmp = load i32* @var32 %val4_tmp = call i32 @llvm.cttz.i32(i32 %val0_tmp, i1 0) ; CHECK: rbit [[REVERSED:w[0-9]+]], {{w[0-9]+}} @@ -93,7 +93,7 @@ define void @cttz_zerodef_i32() { } define void @cttz_zerodef_i64() { -; CHECK: cttz_zerodef_i64: +; CHECK-LABEL: cttz_zerodef_i64: %val0_tmp = load i64* @var64 %val4_tmp = call i64 @llvm.cttz.i64(i64 %val0_tmp, i1 0) ; CHECK: rbit [[REVERSED:x[0-9]+]], {{x[0-9]+}} @@ -103,7 +103,7 @@ define void @cttz_zerodef_i64() { } define void @cttz_zeroundef_i32() { -; CHECK: cttz_zeroundef_i32: +; CHECK-LABEL: cttz_zeroundef_i32: %val0_tmp = load i32* @var32 %val4_tmp = call i32 @llvm.cttz.i32(i32 %val0_tmp, i1 1) ; CHECK: rbit [[REVERSED:w[0-9]+]], {{w[0-9]+}} @@ -113,7 +113,7 @@ define void @cttz_zeroundef_i32() { } define void @cttz_zeroundef_i64() { -; CHECK: cttz_zeroundef_i64: +; CHECK-LABEL: cttz_zeroundef_i64: %val0_tmp = load i64* @var64 %val4_tmp = call i64 @llvm.cttz.i64(i64 %val0_tmp, i1 1) ; CHECK: rbit [[REVERSED:x[0-9]+]], {{x[0-9]+}} @@ -125,7 +125,7 @@ define void @cttz_zeroundef_i64() { ; These two are just compilation tests really: the operation's set to Expand in ; ISelLowering. define void @ctpop_i32() { -; CHECK: ctpop_i32: +; CHECK-LABEL: ctpop_i32: %val0_tmp = load i32* @var32 %val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp) store volatile i32 %val4_tmp, i32* @var32 @@ -133,7 +133,7 @@ define void @ctpop_i32() { } define void @ctpop_i64() { -; CHECK: ctpop_i64: +; CHECK-LABEL: ctpop_i64: %val0_tmp = load i64* @var64 %val4_tmp = call i64 @llvm.ctpop.i64(i64 %val0_tmp) store volatile i64 %val4_tmp, i64* @var64 diff --git a/test/CodeGen/AArch64/dp2.ll b/test/CodeGen/AArch64/dp2.ll index 4c740f6b86238..48b0701ad1faa 100644 --- a/test/CodeGen/AArch64/dp2.ll +++ b/test/CodeGen/AArch64/dp2.ll @@ -6,7 +6,7 @@ @var64_1 = global i64 0 define void @rorv_i64() { -; CHECK: rorv_i64: +; CHECK-LABEL: rorv_i64: %val0_tmp = load i64* @var64_0 %val1_tmp = load i64* @var64_1 %val2_tmp = sub i64 64, %val1_tmp @@ -19,7 +19,7 @@ define void @rorv_i64() { } define void @asrv_i64() { -; CHECK: asrv_i64: +; CHECK-LABEL: asrv_i64: %val0_tmp = load i64* @var64_0 %val1_tmp = load i64* @var64_1 %val4_tmp = ashr i64 %val0_tmp, %val1_tmp @@ -29,7 +29,7 @@ define void @asrv_i64() { } define void @lsrv_i64() { -; CHECK: lsrv_i64: +; CHECK-LABEL: lsrv_i64: %val0_tmp = load i64* @var64_0 %val1_tmp = load i64* @var64_1 %val4_tmp = lshr i64 %val0_tmp, %val1_tmp @@ -39,7 +39,7 @@ define void @lsrv_i64() { } define void @lslv_i64() { -; CHECK: lslv_i64: +; CHECK-LABEL: lslv_i64: %val0_tmp = load i64* @var64_0 %val1_tmp = load i64* @var64_1 %val4_tmp = shl i64 %val0_tmp, %val1_tmp @@ -49,7 +49,7 @@ define void @lslv_i64() { } define void @udiv_i64() { -; CHECK: udiv_i64: +; CHECK-LABEL: udiv_i64: %val0_tmp = load i64* @var64_0 %val1_tmp = load i64* @var64_1 %val4_tmp = udiv i64 %val0_tmp, %val1_tmp @@ -59,7 +59,7 @@ define void @udiv_i64() { } define void @sdiv_i64() { -; CHECK: sdiv_i64: +; CHECK-LABEL: sdiv_i64: %val0_tmp = load i64* @var64_0 %val1_tmp = load i64* @var64_1 %val4_tmp = sdiv i64 %val0_tmp, %val1_tmp @@ -70,7 +70,7 @@ define void @sdiv_i64() { define void @lsrv_i32() { -; CHECK: lsrv_i32: +; CHECK-LABEL: lsrv_i32: %val0_tmp = load i32* @var32_0 %val1_tmp = load i32* @var32_1 %val2_tmp = add i32 1, %val1_tmp @@ -81,7 +81,7 @@ define void @lsrv_i32() { } define void @lslv_i32() { -; CHECK: lslv_i32: +; CHECK-LABEL: lslv_i32: %val0_tmp = load i32* @var32_0 %val1_tmp = 
load i32* @var32_1 %val2_tmp = add i32 1, %val1_tmp @@ -92,7 +92,7 @@ define void @lslv_i32() { } define void @rorv_i32() { -; CHECK: rorv_i32: +; CHECK-LABEL: rorv_i32: %val0_tmp = load i32* @var32_0 %val6_tmp = load i32* @var32_1 %val1_tmp = add i32 1, %val6_tmp @@ -106,7 +106,7 @@ define void @rorv_i32() { } define void @asrv_i32() { -; CHECK: asrv_i32: +; CHECK-LABEL: asrv_i32: %val0_tmp = load i32* @var32_0 %val1_tmp = load i32* @var32_1 %val2_tmp = add i32 1, %val1_tmp @@ -117,7 +117,7 @@ define void @asrv_i32() { } define void @sdiv_i32() { -; CHECK: sdiv_i32: +; CHECK-LABEL: sdiv_i32: %val0_tmp = load i32* @var32_0 %val1_tmp = load i32* @var32_1 %val4_tmp = sdiv i32 %val0_tmp, %val1_tmp @@ -127,7 +127,7 @@ define void @sdiv_i32() { } define void @udiv_i32() { -; CHECK: udiv_i32: +; CHECK-LABEL: udiv_i32: %val0_tmp = load i32* @var32_0 %val1_tmp = load i32* @var32_1 %val4_tmp = udiv i32 %val0_tmp, %val1_tmp @@ -139,7 +139,7 @@ define void @udiv_i32() { ; The point of this test is that we may not actually see (shl GPR32:$Val, (zext GPR32:$Val2)) ; in the DAG (the RHS may be natively 64-bit), but we should still use the lsl instructions. define i32 @test_lsl32() { -; CHECK: test_lsl32: +; CHECK-LABEL: test_lsl32: %val = load i32* @var32_0 %ret = shl i32 1, %val @@ -149,7 +149,7 @@ define i32 @test_lsl32() { } define i32 @test_lsr32() { -; CHECK: test_lsr32: +; CHECK-LABEL: test_lsr32: %val = load i32* @var32_0 %ret = lshr i32 1, %val @@ -159,7 +159,7 @@ define i32 @test_lsr32() { } define i32 @test_asr32(i32 %in) { -; CHECK: test_asr32: +; CHECK-LABEL: test_asr32: %val = load i32* @var32_0 %ret = ashr i32 %in, %val diff --git a/test/CodeGen/AArch64/elf-extern.ll b/test/CodeGen/AArch64/elf-extern.ll deleted file mode 100644 index 8bf1b2ff4fa9b..0000000000000 --- a/test/CodeGen/AArch64/elf-extern.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s - -; External symbols are a different concept to global variables but should still -; get relocations and so on when used. - -declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) - -define i32 @check_extern() { - call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* undef, i32 undef, i32 4, i1 0) - ret i32 0 -} - -; CHECK: Relocations [ -; CHECK: Section (1) .text { -; CHECK: 0x{{[0-9,A-F]+}} R_AARCH64_CALL26 memcpy -; CHECK: } -; CHECK: ] diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll index bc0acc2533881..322b3f4522d61 100644 --- a/test/CodeGen/AArch64/extern-weak.ll +++ b/test/CodeGen/AArch64/extern-weak.ll @@ -51,4 +51,4 @@ define i32* @wibble() { ; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var ; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var ; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/extract.ll b/test/CodeGen/AArch64/extract.ll index 06267816a4e18..62d9ed2fc9d9f 100644 --- a/test/CodeGen/AArch64/extract.ll +++ b/test/CodeGen/AArch64/extract.ll @@ -1,7 +1,7 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i64 @ror_i64(i64 %in) { -; CHECK: ror_i64: +; CHECK-LABEL: ror_i64: %left = shl i64 %in, 19 %right = lshr i64 %in, 45 %val5 = or i64 %left, %right @@ -10,7 +10,7 @@ define i64 @ror_i64(i64 %in) { } define i32 @ror_i32(i32 %in) { -; CHECK: ror_i32: +; CHECK-LABEL: ror_i32: %left = shl i32 %in, 9 %right = lshr i32 %in, 23 %val5 = or i32 %left, %right @@ -19,7 +19,7 @@ define i32 @ror_i32(i32 %in) { } define i32 @extr_i32(i32 %lhs, i32 %rhs) { -; CHECK: extr_i32: +; CHECK-LABEL: extr_i32: %left = shl i32 %lhs, 6 %right = lshr i32 %rhs, 26 %val = or i32 %left, %right @@ -31,7 +31,7 @@ define i32 @extr_i32(i32 %lhs, i32 %rhs) { } define i64 @extr_i64(i64 %lhs, i64 %rhs) { -; CHECK: extr_i64: +; CHECK-LABEL: extr_i64: %right = lshr i64 %rhs, 40 %left = shl i64 %lhs, 24 %val = or i64 %right, %left @@ -45,7 +45,7 @@ define i64 @extr_i64(i64 %lhs, i64 %rhs) { ; Regression test: a bad experimental pattern crept into git which optimised ; this pattern to a single EXTR. define i32 @extr_regress(i32 %a, i32 %b) { -; CHECK: extr_regress: +; CHECK-LABEL: extr_regress: %sh1 = shl i32 %a, 14 %sh2 = lshr i32 %b, 14 diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll index e40aa3033bde5..c6c050570dd6f 100644 --- a/test/CodeGen/AArch64/fastcc-reserved.ll +++ b/test/CodeGen/AArch64/fastcc-reserved.ll @@ -7,7 +7,7 @@ declare fastcc void @will_pop([8 x i32], i32 %val) define fastcc void @foo(i32 %in) { -; CHECK: foo: +; CHECK-LABEL: foo: %addr = alloca i8, i32 %in @@ -34,7 +34,7 @@ define fastcc void @foo(i32 %in) { declare void @wont_pop([8 x i32], i32 %val) define void @foo1(i32 %in) { -; CHECK: foo1: +; CHECK-LABEL: foo1: %addr = alloca i8, i32 %in ; Normal frame setup again diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll index 41cde94edc1c3..a4cd37858ee4a 100644 --- a/test/CodeGen/AArch64/fastcc.ll +++ b/test/CodeGen/AArch64/fastcc.ll @@ -5,10 +5,10 @@ ; stack, so try to make sure this is respected. 
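; Nearly every hunk in this patch tightens a bare "; CHECK: <function>:" line
; into "; CHECK-LABEL:", and fastcc.ll just below does the same for its
; CHECK-TAIL prefix. CHECK-LABEL makes FileCheck cut the output into blocks at
; each matched label and confine the neighbouring checks to their own block,
; so a pattern can no longer be satisfied by an instruction emitted for a
; different function. A minimal sketch with two hypothetical functions,
; illustrative only and not part of the patch:

; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s

define i32 @first(i32 %a, i32 %b) {
; CHECK-LABEL: first:
; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
  %sum = add i32 %a, %b
  ret i32 %sum
}

define i32 @second(i32 %a, i32 %b) {
; CHECK-LABEL: second:
; With a plain "CHECK: second:" the sub pattern below could be matched by
; output belonging to any later function; CHECK-LABEL limits it to the text
; between "second:" and the next label, so a failure is reported here.
; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
  %diff = sub i32 %a, %b
  ret i32 %diff
}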
define fastcc void @func_stack0() { -; CHECK: func_stack0: +; CHECK-LABEL: func_stack0: ; CHECK: sub sp, sp, #48 -; CHECK-TAIL: func_stack0: +; CHECK-TAIL-LABEL: func_stack0: ; CHECK-TAIL: sub sp, sp, #48 @@ -45,10 +45,10 @@ define fastcc void @func_stack0() { } define fastcc void @func_stack8([8 x i32], i32 %stacked) { -; CHECK: func_stack8: +; CHECK-LABEL: func_stack8: ; CHECK: sub sp, sp, #48 -; CHECK-TAIL: func_stack8: +; CHECK-TAIL-LABEL: func_stack8: ; CHECK-TAIL: sub sp, sp, #48 @@ -84,10 +84,10 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) { } define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { -; CHECK: func_stack32: +; CHECK-LABEL: func_stack32: ; CHECK: sub sp, sp, #48 -; CHECK-TAIL: func_stack32: +; CHECK-TAIL-LABEL: func_stack32: ; CHECK-TAIL: sub sp, sp, #48 diff --git a/test/CodeGen/AArch64/fcmp.ll b/test/CodeGen/AArch64/fcmp.ll index ad4a903c9b250..a9518eabb754c 100644 --- a/test/CodeGen/AArch64/fcmp.ll +++ b/test/CodeGen/AArch64/fcmp.ll @@ -3,7 +3,7 @@ declare void @bar(i32) define void @test_float(float %a, float %b) { -; CHECK: test_float: +; CHECK-LABEL: test_float: %tst1 = fcmp oeq float %a, %b br i1 %tst1, label %end, label %t2 @@ -42,7 +42,7 @@ end: } define void @test_double(double %a, double %b) { -; CHECK: test_double: +; CHECK-LABEL: test_double: %tst1 = fcmp oeq double %a, %b br i1 %tst1, label %end, label %t2 diff --git a/test/CodeGen/AArch64/fcvt-fixed.ll b/test/CodeGen/AArch64/fcvt-fixed.ll index 0f7b95b2a48f8..9d66da49437b3 100644 --- a/test/CodeGen/AArch64/fcvt-fixed.ll +++ b/test/CodeGen/AArch64/fcvt-fixed.ll @@ -4,7 +4,7 @@ @var64 = global i64 0 define void @test_fcvtzs(float %flt, double %dbl) { -; CHECK: test_fcvtzs: +; CHECK-LABEL: test_fcvtzs: %fix1 = fmul float %flt, 128.0 %cvt1 = fptosi float %fix1 to i32 @@ -50,7 +50,7 @@ define void @test_fcvtzs(float %flt, double %dbl) { } define void @test_fcvtzu(float %flt, double %dbl) { -; CHECK: test_fcvtzu: +; CHECK-LABEL: test_fcvtzu: %fix1 = fmul float %flt, 128.0 %cvt1 = fptoui float %fix1 to i32 @@ -99,7 +99,7 @@ define void @test_fcvtzu(float %flt, double %dbl) { @vardouble = global double 0.0 define void @test_scvtf(i32 %int, i64 %long) { -; CHECK: test_scvtf: +; CHECK-LABEL: test_scvtf: %cvt1 = sitofp i32 %int to float %fix1 = fdiv float %cvt1, 128.0 @@ -145,7 +145,7 @@ define void @test_scvtf(i32 %int, i64 %long) { } define void @test_ucvtf(i32 %int, i64 %long) { -; CHECK: test_ucvtf: +; CHECK-LABEL: test_ucvtf: %cvt1 = uitofp i32 %int to float %fix1 = fdiv float %cvt1, 128.0 diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll index c771d683a99c1..b28eb3ea1bef3 100644 --- a/test/CodeGen/AArch64/fcvt-int.ll +++ b/test/CodeGen/AArch64/fcvt-int.ll @@ -1,12 +1,12 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i32 @test_floattoi32(float %in) { -; CHECK: test_floattoi32: +; CHECK-LABEL: test_floattoi32: %signed = fptosi float %in to i32 %unsigned = fptoui float %in to i32 -; CHECK: fcvtzu [[UNSIG:w[0-9]+]], {{s[0-9]+}} -; CHECK: fcvtzs [[SIG:w[0-9]+]], {{s[0-9]+}} +; CHECK-DAG: fcvtzu [[UNSIG:w[0-9]+]], {{s[0-9]+}} +; CHECK-DAG: fcvtzs [[SIG:w[0-9]+]], {{s[0-9]+}} %res = sub i32 %signed, %unsigned ; CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]] @@ -16,12 +16,12 @@ define i32 @test_floattoi32(float %in) { } define i32 @test_doubletoi32(double %in) { -; CHECK: test_doubletoi32: +; CHECK-LABEL: test_doubletoi32: %signed = fptosi double %in to i32 %unsigned = fptoui double %in to i32 -; CHECK: 
fcvtzu [[UNSIG:w[0-9]+]], {{d[0-9]+}} -; CHECK: fcvtzs [[SIG:w[0-9]+]], {{d[0-9]+}} +; CHECK-DAG: fcvtzu [[UNSIG:w[0-9]+]], {{d[0-9]+}} +; CHECK-DAG: fcvtzs [[SIG:w[0-9]+]], {{d[0-9]+}} %res = sub i32 %signed, %unsigned ; CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]] @@ -31,12 +31,12 @@ define i32 @test_doubletoi32(double %in) { } define i64 @test_floattoi64(float %in) { -; CHECK: test_floattoi64: +; CHECK-LABEL: test_floattoi64: %signed = fptosi float %in to i64 %unsigned = fptoui float %in to i64 -; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{s[0-9]+}} -; CHECK: fcvtzs [[SIG:x[0-9]+]], {{s[0-9]+}} +; CHECK-DAG: fcvtzu [[UNSIG:x[0-9]+]], {{s[0-9]+}} +; CHECK-DAG: fcvtzs [[SIG:x[0-9]+]], {{s[0-9]+}} %res = sub i64 %signed, %unsigned ; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]] @@ -46,12 +46,12 @@ define i64 @test_floattoi64(float %in) { } define i64 @test_doubletoi64(double %in) { -; CHECK: test_doubletoi64: +; CHECK-LABEL: test_doubletoi64: %signed = fptosi double %in to i64 %unsigned = fptoui double %in to i64 -; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{d[0-9]+}} -; CHECK: fcvtzs [[SIG:x[0-9]+]], {{d[0-9]+}} +; CHECK-DAG: fcvtzu [[UNSIG:x[0-9]+]], {{d[0-9]+}} +; CHECK-DAG: fcvtzs [[SIG:x[0-9]+]], {{d[0-9]+}} %res = sub i64 %signed, %unsigned ; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]] @@ -61,12 +61,12 @@ define i64 @test_doubletoi64(double %in) { } define float @test_i32tofloat(i32 %in) { -; CHECK: test_i32tofloat: +; CHECK-LABEL: test_i32tofloat: %signed = sitofp i32 %in to float %unsigned = uitofp i32 %in to float -; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{w[0-9]+}} -; CHECK: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}} +; CHECK-DAG: ucvtf [[UNSIG:s[0-9]+]], {{w[0-9]+}} +; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}} %res = fsub float %signed, %unsigned ; CHECL: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]] @@ -75,12 +75,12 @@ define float @test_i32tofloat(i32 %in) { } define double @test_i32todouble(i32 %in) { -; CHECK: test_i32todouble: +; CHECK-LABEL: test_i32todouble: %signed = sitofp i32 %in to double %unsigned = uitofp i32 %in to double -; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{w[0-9]+}} -; CHECK: scvtf [[SIG:d[0-9]+]], {{w[0-9]+}} +; CHECK-DAG: ucvtf [[UNSIG:d[0-9]+]], {{w[0-9]+}} +; CHECK-DAG: scvtf [[SIG:d[0-9]+]], {{w[0-9]+}} %res = fsub double %signed, %unsigned ; CHECK: fsub {{d[0-9]+}}, [[SIG]], [[UNSIG]] @@ -89,12 +89,12 @@ define double @test_i32todouble(i32 %in) { } define float @test_i64tofloat(i64 %in) { -; CHECK: test_i64tofloat: +; CHECK-LABEL: test_i64tofloat: %signed = sitofp i64 %in to float %unsigned = uitofp i64 %in to float -; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{x[0-9]+}} -; CHECK: scvtf [[SIG:s[0-9]+]], {{x[0-9]+}} +; CHECK-DAG: ucvtf [[UNSIG:s[0-9]+]], {{x[0-9]+}} +; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{x[0-9]+}} %res = fsub float %signed, %unsigned ; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]] @@ -103,12 +103,12 @@ define float @test_i64tofloat(i64 %in) { } define double @test_i64todouble(i64 %in) { -; CHECK: test_i64todouble: +; CHECK-LABEL: test_i64todouble: %signed = sitofp i64 %in to double %unsigned = uitofp i64 %in to double -; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{x[0-9]+}} -; CHECK: scvtf [[SIG:d[0-9]+]], {{x[0-9]+}} +; CHECK-DAG: ucvtf [[UNSIG:d[0-9]+]], {{x[0-9]+}} +; CHECK-DAG: scvtf [[SIG:d[0-9]+]], {{x[0-9]+}} %res = fsub double %signed, %unsigned ; CHECK: sub {{d[0-9]+}}, [[SIG]], [[UNSIG]] @@ -117,7 +117,7 @@ define double @test_i64todouble(i64 %in) { } define i32 @test_bitcastfloattoi32(float %in) { -; CHECK: test_bitcastfloattoi32: +; CHECK-LABEL: test_bitcastfloattoi32: %res = bitcast float 
%in to i32 ; CHECK: fmov {{w[0-9]+}}, {{s[0-9]+}} @@ -125,7 +125,7 @@ define i32 @test_bitcastfloattoi32(float %in) { } define i64 @test_bitcastdoubletoi64(double %in) { -; CHECK: test_bitcastdoubletoi64: +; CHECK-LABEL: test_bitcastdoubletoi64: %res = bitcast double %in to i64 ; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} @@ -133,7 +133,7 @@ define i64 @test_bitcastdoubletoi64(double %in) { } define float @test_bitcasti32tofloat(i32 %in) { -; CHECK: test_bitcasti32tofloat: +; CHECK-LABEL: test_bitcasti32tofloat: %res = bitcast i32 %in to float ; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}} @@ -142,7 +142,7 @@ define float @test_bitcasti32tofloat(i32 %in) { } define double @test_bitcasti64todouble(i64 %in) { -; CHECK: test_bitcasti64todouble: +; CHECK-LABEL: test_bitcasti64todouble: %res = bitcast i64 %in to double ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} diff --git a/test/CodeGen/AArch64/flags-multiuse.ll b/test/CodeGen/AArch64/flags-multiuse.ll index 940c146f0a9ff..e99c72833997e 100644 --- a/test/CodeGen/AArch64/flags-multiuse.ll +++ b/test/CodeGen/AArch64/flags-multiuse.ll @@ -9,7 +9,7 @@ declare void @bar() @var = global i32 0 define i32 @test_multiflag(i32 %n, i32 %m, i32 %o) { -; CHECK: test_multiflag: +; CHECK-LABEL: test_multiflag: %test = icmp ne i32 %n, %m ; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]] diff --git a/test/CodeGen/AArch64/floatdp_1source.ll b/test/CodeGen/AArch64/floatdp_1source.ll index c94ba9b57b5a0..3d7f8f0369fc4 100644 --- a/test/CodeGen/AArch64/floatdp_1source.ll +++ b/test/CodeGen/AArch64/floatdp_1source.ll @@ -26,7 +26,7 @@ declare float @nearbyintf(float) readonly declare double @nearbyint(double) readonly define void @simple_float() { -; CHECK: simple_float: +; CHECK-LABEL: simple_float: %val1 = load volatile float* @varfloat %valabs = call float @fabsf(float %val1) @@ -65,7 +65,7 @@ define void @simple_float() { } define void @simple_double() { -; CHECK: simple_double: +; CHECK-LABEL: simple_double: %val1 = load volatile double* @vardouble %valabs = call double @fabs(double %val1) @@ -104,7 +104,7 @@ define void @simple_double() { } define void @converts() { -; CHECK: converts: +; CHECK-LABEL: converts: %val16 = load volatile half* @varhalf %val32 = load volatile float* @varfloat diff --git a/test/CodeGen/AArch64/floatdp_2source.ll b/test/CodeGen/AArch64/floatdp_2source.ll index b2256b342acf5..bb655285ac540 100644 --- a/test/CodeGen/AArch64/floatdp_2source.ll +++ b/test/CodeGen/AArch64/floatdp_2source.ll @@ -4,7 +4,7 @@ @vardouble = global double 0.0 define void @testfloat() { -; CHECK: testfloat: +; CHECK-LABEL: testfloat: %val1 = load float* @varfloat %val2 = fadd float %val1, %val1 @@ -32,7 +32,7 @@ define void @testfloat() { } define void @testdouble() { -; CHECK: testdouble: +; CHECK-LABEL: testdouble: %val1 = load double* @vardouble %val2 = fadd double %val1, %val1 diff --git a/test/CodeGen/AArch64/fp-cond-sel.ll b/test/CodeGen/AArch64/fp-cond-sel.ll index 56e8f16f9b36f..572f42e210b15 100644 --- a/test/CodeGen/AArch64/fp-cond-sel.ll +++ b/test/CodeGen/AArch64/fp-cond-sel.ll @@ -4,7 +4,7 @@ @vardouble = global double 0.0 define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { -; CHECK: test_csel: +; CHECK-LABEL: test_csel: %tst1 = icmp ugt i32 %lhs32, %rhs32 %val1 = select i1 %tst1, float 0.0, float 1.0 diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll index 39db9be15771b..590557f1e8edc 100644 --- a/test/CodeGen/AArch64/fp-dp3.ll +++ b/test/CodeGen/AArch64/fp-dp3.ll @@ -1,102 +1,137 @@ ; RUN: llc -verify-machineinstrs < %s 
-mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST declare float @llvm.fma.f32(float, float, float) declare double @llvm.fma.f64(double, double, double) define float @test_fmadd(float %a, float %b, float %c) { -; CHECK: test_fmadd: +; CHECK-LABEL: test_fmadd: +; CHECK-NOFAST-LABEL: test_fmadd: %val = call float @llvm.fma.f32(float %a, float %b, float %c) ; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %val } define float @test_fmsub(float %a, float %b, float %c) { -; CHECK: test_fmsub: +; CHECK-LABEL: test_fmsub: +; CHECK-NOFAST-LABEL: test_fmsub: %nega = fsub float -0.0, %a %val = call float @llvm.fma.f32(float %nega, float %b, float %c) ; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %val } define float @test_fnmadd(float %a, float %b, float %c) { -; CHECK: test_fnmadd: +; CHECK-LABEL: test_fnmadd: +; CHECK-NOFAST-LABEL: test_fnmadd: %negc = fsub float -0.0, %c %val = call float @llvm.fma.f32(float %a, float %b, float %negc) ; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %val } define float @test_fnmsub(float %a, float %b, float %c) { -; CHECK: test_fnmsub: +; CHECK-LABEL: test_fnmsub: +; CHECK-NOFAST-LABEL: test_fnmsub: %nega = fsub float -0.0, %a %negc = fsub float -0.0, %c %val = call float @llvm.fma.f32(float %nega, float %b, float %negc) ; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %val } define double @testd_fmadd(double %a, double %b, double %c) { -; CHECK: testd_fmadd: +; CHECK-LABEL: testd_fmadd: +; CHECK-NOFAST-LABEL: testd_fmadd: %val = call double @llvm.fma.f64(double %a, double %b, double %c) ; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NOFAST: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret double %val } define double @testd_fmsub(double %a, double %b, double %c) { -; CHECK: testd_fmsub: +; CHECK-LABEL: testd_fmsub: +; CHECK-NOFAST-LABEL: testd_fmsub: %nega = fsub double -0.0, %a %val = call double @llvm.fma.f64(double %nega, double %b, double %c) ; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NOFAST: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret double %val } define double @testd_fnmadd(double %a, double %b, double %c) { -; CHECK: testd_fnmadd: +; CHECK-LABEL: testd_fnmadd: +; CHECK-NOFAST-LABEL: testd_fnmadd: %negc = fsub double -0.0, %c %val = call double @llvm.fma.f64(double %a, double %b, double %negc) ; CHECK: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NOFAST: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret double %val } define double @testd_fnmsub(double %a, double %b, double %c) { -; CHECK: testd_fnmsub: +; CHECK-LABEL: testd_fnmsub: +; CHECK-NOFAST-LABEL: testd_fnmsub: %nega = fsub double -0.0, %a %negc = fsub double -0.0, %c %val = call double @llvm.fma.f64(double %nega, double %b, double %negc) ; CHECK: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NOFAST: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret double %val } define float @test_fmadd_unfused(float %a, float %b, float 
%c) { -; CHECK: test_fmadd_unfused: +; CHECK-LABEL: test_fmadd_unfused: +; CHECK-NOFAST-LABEL: test_fmadd_unfused: %prod = fmul float %b, %c %sum = fadd float %a, %prod ; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST-NOT: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %sum } define float @test_fmsub_unfused(float %a, float %b, float %c) { -; CHECK: test_fmsub_unfused: +; CHECK-LABEL: test_fmsub_unfused: +; CHECK-NOFAST-LABEL: test_fmsub_unfused: %prod = fmul float %b, %c %diff = fsub float %a, %prod ; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST-NOT: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %diff } define float @test_fnmadd_unfused(float %a, float %b, float %c) { -; CHECK: test_fnmadd_unfused: +; CHECK-LABEL: test_fnmadd_unfused: +; CHECK-NOFAST-LABEL: test_fnmadd_unfused: %nega = fsub float -0.0, %a %prod = fmul float %b, %c %sum = fadd float %nega, %prod ; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST-NOT: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ret float %sum } define float @test_fnmsub_unfused(float %a, float %b, float %c) { -; CHECK: test_fnmsub_unfused: +; CHECK-LABEL: test_fnmsub_unfused: +; CHECK-NOFAST-LABEL: test_fnmsub_unfused: %nega = fsub float -0.0, %a %prod = fmul float %b, %c %diff = fsub float %nega, %prod ; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST-NOT: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST-DAG: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST-DAG: fneg {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST-DAG: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NOFAST: ret ret float %diff } diff --git a/test/CodeGen/AArch64/fp128-folding.ll b/test/CodeGen/AArch64/fp128-folding.ll index b5bdcf4f37b44..b1c560d2b6486 100644 --- a/test/CodeGen/AArch64/fp128-folding.ll +++ b/test/CodeGen/AArch64/fp128-folding.ll @@ -5,7 +5,7 @@ declare void @bar(i8*, i8*, i32*) ; which is not supported. define fp128 @test_folding() { -; CHECK: test_folding: +; CHECK-LABEL: test_folding: %l = alloca i32 store i32 42, i32* %l %val = load i32* %l @@ -14,4 +14,4 @@ define fp128 @test_folding() { ; successfully. ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI ret fp128 %fpval -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/fp128.ll b/test/CodeGen/AArch64/fp128.ll index 258d34b8f81ff..c312bb1917ab8 100644 --- a/test/CodeGen/AArch64/fp128.ll +++ b/test/CodeGen/AArch64/fp128.ll @@ -4,7 +4,7 @@ @rhs = global fp128 zeroinitializer define fp128 @test_add() { -; CHECK: test_add: +; CHECK-LABEL: test_add: %lhs = load fp128* @lhs %rhs = load fp128* @rhs @@ -17,7 +17,7 @@ define fp128 @test_add() { } define fp128 @test_sub() { -; CHECK: test_sub: +; CHECK-LABEL: test_sub: %lhs = load fp128* @lhs %rhs = load fp128* @rhs @@ -30,7 +30,7 @@ define fp128 @test_sub() { } define fp128 @test_mul() { -; CHECK: test_mul: +; CHECK-LABEL: test_mul: %lhs = load fp128* @lhs %rhs = load fp128* @rhs @@ -43,7 +43,7 @@ define fp128 @test_mul() { } define fp128 @test_div() { -; CHECK: test_div: +; CHECK-LABEL: test_div: %lhs = load fp128* @lhs %rhs = load fp128* @rhs @@ -59,7 +59,7 @@ define fp128 @test_div() { @var64 = global i64 0 define void @test_fptosi() { -; CHECK: test_fptosi: +; CHECK-LABEL: test_fptosi: %val = load fp128* @lhs %val32 = fptosi fp128 %val to i32 @@ -74,7 +74,7 @@ define void @test_fptosi() { } define void @test_fptoui() { -; CHECK: test_fptoui: +; CHECK-LABEL: test_fptoui: %val = load fp128* @lhs %val32 = fptoui fp128 %val to i32 @@ -89,7 +89,7 @@ define void @test_fptoui() { } define void @test_sitofp() { -; CHECK: test_sitofp: +; CHECK-LABEL: test_sitofp: %src32 = load i32* @var32 %val32 = sitofp i32 %src32 to fp128 @@ -105,7 +105,7 @@ define void @test_sitofp() { } define void @test_uitofp() { -; CHECK: test_uitofp: +; CHECK-LABEL: test_uitofp: %src32 = load i32* @var32 %val32 = uitofp i32 %src32 to fp128 @@ -121,7 +121,7 @@ define void @test_uitofp() { } define i1 @test_setcc1() { -; CHECK: test_setcc1: +; CHECK-LABEL: test_setcc1: %lhs = load fp128* @lhs %rhs = load fp128* @rhs @@ -140,7 +140,7 @@ define i1 @test_setcc1() { } define i1 @test_setcc2() { -; CHECK: test_setcc2: +; CHECK-LABEL: test_setcc2: %lhs = load fp128* @lhs %rhs = load fp128* @rhs @@ -150,14 +150,14 @@ define i1 @test_setcc2() { ; Technically, everything after the call to __letf2 is redundant, but we'll let ; LLVM have its fun for now. %val = fcmp ugt fp128 %lhs, %rhs -; CHECK: bl __unordtf2 -; CHECK: mov x[[UNORDERED:[0-9]+]], x0 - ; CHECK: bl __gttf2 ; CHECK: cmp w0, #0 ; CHECK: csinc [[GT:w[0-9]+]], wzr, wzr, le -; CHECK: cmp w[[UNORDERED]], #0 + +; CHECK: bl __unordtf2 +; CHECK: cmp w0, #0 ; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq + ; CHECK: orr w0, [[UNORDERED]], [[GT]] ret i1 %val @@ -165,7 +165,7 @@ define i1 @test_setcc2() { } define i32 @test_br_cc() { -; CHECK: test_br_cc: +; CHECK-LABEL: test_br_cc: %lhs = load fp128* @lhs %rhs = load fp128* @rhs @@ -174,15 +174,14 @@ define i32 @test_br_cc() { ; olt == !uge, which LLVM unfortunately "optimizes" this to. 
%cond = fcmp olt fp128 %lhs, %rhs -; CHECK: bl __unordtf2 -; CHECK: mov x[[UNORDERED:[0-9]+]], x0 - ; CHECK: bl __getf2 ; CHECK: cmp w0, #0 - ; CHECK: csinc [[OGE:w[0-9]+]], wzr, wzr, lt -; CHECK: cmp w[[UNORDERED]], #0 + +; CHECK: bl __unordtf2 +; CHECK: cmp w0, #0 ; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq + ; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]] ; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]] br i1 %cond, label %iftrue, label %iffalse @@ -202,7 +201,7 @@ iffalse: } define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) { -; CHECK: test_select: +; CHECK-LABEL: test_select: %val = select i1 %cond, fp128 %lhs, fp128 %rhs store fp128 %val, fp128* @lhs @@ -222,7 +221,7 @@ define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) { @vardouble = global double 0.0 define void @test_round() { -; CHECK: test_round: +; CHECK-LABEL: test_round: %val = load fp128* @lhs @@ -240,7 +239,7 @@ define void @test_round() { } define void @test_extend() { -; CHECK: test_extend: +; CHECK-LABEL: test_extend: %val = load fp128* @lhs @@ -265,7 +264,7 @@ define fp128 @test_neg(fp128 %in) { ; Make sure the weird hex constant below *is* -0.0 ; CHECK-NEXT: fp128 -0 -; CHECK: test_neg: +; CHECK-LABEL: test_neg: ; Could in principle be optimized to fneg which we can't select, this makes ; sure that doesn't happen. diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll index fd28aeef92914..b8f716959449f 100644 --- a/test/CodeGen/AArch64/fpimm.ll +++ b/test/CodeGen/AArch64/fpimm.ll @@ -4,31 +4,33 @@ @varf64 = global double 0.0 define void @check_float() { -; CHECK: check_float: +; CHECK-LABEL: check_float: %val = load float* @varf32 %newval1 = fadd float %val, 8.5 store volatile float %newval1, float* @varf32 -; CHECK: fmov {{s[0-9]+}}, #8.5 +; CHECK-DAG: fmov [[EIGHT5:s[0-9]+]], #8.5 %newval2 = fadd float %val, 128.0 store volatile float %newval2, float* @varf32 -; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI0_0 +; CHECK-DAG: ldr [[HARD:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI0_0 +; CHECK: ret ret void } define void @check_double() { -; CHECK: check_double: +; CHECK-LABEL: check_double: %val = load double* @varf64 %newval1 = fadd double %val, 8.5 store volatile double %newval1, double* @varf64 -; CHECK: fmov {{d[0-9]+}}, #8.5 +; CHECK-DAG: fmov {{d[0-9]+}}, #8.5 %newval2 = fadd double %val, 128.0 store volatile double %newval2, double* @varf64 -; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0 +; CHECK-DAG: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0 +; CHECK: ret ret void } diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll new file mode 100644 index 0000000000000..182704bd6541a --- /dev/null +++ b/test/CodeGen/AArch64/frameaddr.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i8* @t() nounwind { +entry: +; CHECK-LABEL: t: +; CHECK: mov x0, x29 + %0 = call i8* @llvm.frameaddress(i32 0) + ret i8* %0 +} + +define i8* @t2() nounwind { +entry: +; CHECK-LABEL: t2: +; CHECK: ldr x[[reg:[0-9]+]], [x29] +; CHECK: ldr x[[reg]], [x[[reg]]] + %0 = call i8* @llvm.frameaddress(i32 2) + ret i8* %0 +} + +declare i8* @llvm.frameaddress(i32) nounwind readnone diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll index 78fde6a3c33ab..430d77f9e9327 100644 --- a/test/CodeGen/AArch64/func-argpassing.ll +++ b/test/CodeGen/AArch64/func-argpassing.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc 
-verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s %myStruct = type { i64 , i8, i32 } @@ -11,7 +12,7 @@ @varstruct = global %myStruct zeroinitializer define void @take_i8s(i8 %val1, i8 %val2) { -; CHECK: take_i8s: +; CHECK-LABEL: take_i8s: store i8 %val2, i8* @var8 ; Not using w1 may be technically allowed, but it would indicate a ; problem in itself. @@ -20,9 +21,10 @@ define void @take_i8s(i8 %val1, i8 %val2) { } define void @add_floats(float %val1, float %val2) { -; CHECK: add_floats: +; CHECK-LABEL: add_floats: %newval = fadd float %val1, %val2 ; CHECK: fadd [[ADDRES:s[0-9]+]], s0, s1 +; CHECK-NOFP-NOT: fadd store float %newval, float* @varfloat ; CHECK: str [[ADDRES]], [{{x[0-9]+}}, #:lo12:varfloat] ret void @@ -31,19 +33,19 @@ define void @add_floats(float %val1, float %val2) { ; byval pointers should be allocated to the stack and copied as if ; with memcpy. define void @take_struct(%myStruct* byval %structval) { -; CHECK: take_struct: +; CHECK-LABEL: take_struct: %addr0 = getelementptr %myStruct* %structval, i64 0, i32 2 %addr1 = getelementptr %myStruct* %structval, i64 0, i32 0 - %val0 = load i32* %addr0 + %val0 = load volatile i32* %addr0 ; Some weird move means x0 is used for one access ; CHECK: ldr [[REG32:w[0-9]+]], [{{x[0-9]+|sp}}, #12] - store i32 %val0, i32* @var32 + store volatile i32 %val0, i32* @var32 ; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32] - %val1 = load i64* %addr1 + %val1 = load volatile i64* %addr1 ; CHECK: ldr [[REG64:x[0-9]+]], [{{x[0-9]+|sp}}] - store i64 %val1, i64* @var64 + store volatile i64 %val1, i64* @var64 ; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64] ret void @@ -51,19 +53,19 @@ define void @take_struct(%myStruct* byval %structval) { ; %structval should be at sp + 16 define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %structval) { -; CHECK: check_byval_align: +; CHECK-LABEL: check_byval_align: %addr0 = getelementptr %myStruct* %structval, i64 0, i32 2 %addr1 = getelementptr %myStruct* %structval, i64 0, i32 0 - %val0 = load i32* %addr0 + %val0 = load volatile i32* %addr0 ; Some weird move means x0 is used for one access ; CHECK: add x[[STRUCTVAL_ADDR:[0-9]+]], sp, #16 ; CHECK: ldr [[REG32:w[0-9]+]], [x[[STRUCTVAL_ADDR]], #12] store i32 %val0, i32* @var32 ; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32] - %val1 = load i64* %addr1 + %val1 = load volatile i64* %addr1 ; CHECK: ldr [[REG64:x[0-9]+]], [sp, #16] store i64 %val1, i64* @var64 ; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64] @@ -72,7 +74,7 @@ define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %st } define i32 @return_int() { -; CHECK: return_int: +; CHECK-LABEL: return_int: %val = load i32* @var32 ret i32 %val ; CHECK: ldr w0, [{{x[0-9]+}}, #:lo12:var32] @@ -81,16 +83,17 @@ define i32 @return_int() { } define double @return_double() { -; CHECK: return_double: +; CHECK-LABEL: return_double: ret double 3.14 ; CHECK: ldr d0, [{{x[0-9]+}}, #:lo12:.LCPI +; CHECK-NOFP-NOT: ldr d0, } ; This is the kind of IR clang will produce for returning a struct ; small enough to go into registers. Not all that pretty, but it ; works. 
define [2 x i64] @return_struct() { -; CHECK: return_struct: +; CHECK-LABEL: return_struct: %addr = bitcast %myStruct* @varstruct to [2 x i64]* %val = load [2 x i64]* %addr ret [2 x i64] %val @@ -107,7 +110,7 @@ define [2 x i64] @return_struct() { ; structs larger than 16 bytes, but C semantics can still be provided ; if LLVM does it to %myStruct too. So this is the simplest check define void @return_large_struct(%myStruct* sret %retval) { -; CHECK: return_large_struct: +; CHECK-LABEL: return_large_struct: %addr0 = getelementptr %myStruct* %retval, i64 0, i32 0 %addr1 = getelementptr %myStruct* %retval, i64 0, i32 1 %addr2 = getelementptr %myStruct* %retval, i64 0, i32 2 @@ -128,19 +131,20 @@ define void @return_large_struct(%myStruct* sret %retval) { define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var45, i32* %var6, %myStruct* byval %struct, i32* byval %stacked, double %notstacked) { -; CHECK: struct_on_stack: +; CHECK-LABEL: struct_on_stack: %addr = getelementptr %myStruct* %struct, i64 0, i32 0 - %val64 = load i64* %addr - store i64 %val64, i64* @var64 + %val64 = load volatile i64* %addr + store volatile i64 %val64, i64* @var64 ; Currently nothing on local stack, so struct should be at sp ; CHECK: ldr [[VAL64:x[0-9]+]], [sp] ; CHECK: str [[VAL64]], [{{x[0-9]+}}, #:lo12:var64] - store double %notstacked, double* @vardouble + store volatile double %notstacked, double* @vardouble ; CHECK-NOT: ldr d0 ; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble +; CHECK-NOFP-NOT: str d0, - %retval = load i32* %stacked + %retval = load volatile i32* %stacked ret i32 %retval ; CHECK: ldr w0, [sp, #16] } @@ -148,7 +152,7 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3, float %var4, float %var5, float %var6, float %var7, float %var8) { -; CHECK: stacked_fpu: +; CHECK-LABEL: stacked_fpu: store float %var8, float* @varfloat ; Beware as above: the offset would be different on big-endian ; machines if the first ldr were changed to use s-registers. 
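; This file and func-calls.ll further below each gain a second RUN line that
; recompiles the same source with -mattr=-fp-armv8 and pipes it to FileCheck
; under the CHECK-NOFP prefix, mostly so CHECK-NOFP-NOT can assert that no FP
; instructions are emitted. A minimal sketch of the two-prefix pattern, using
; a hypothetical function and global rather than anything from the patch:

; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s

@var = global float 0.0

define void @double_it(float %in) {
; CHECK-LABEL: double_it:
; CHECK-NOFP-LABEL: double_it:
  %res = fadd float %in, %in
  store float %res, float* @var
; The default run expects a hardware fadd; the -fp-armv8 run only asserts
; that no fadd appears.
; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFP-NOT: fadd
  ret void
}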
@@ -176,17 +180,17 @@ define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3, ; CHECK: check_i128_stackalign store i128 %stack2, i128* @var128 ; Nothing local on stack in current codegen, so first stack is 16 away -; CHECK: ldr {{x[0-9]+}}, [sp, #16] +; CHECK: add x[[REG:[0-9]+]], sp, #16 +; CHECK: ldr {{x[0-9]+}}, [x[[REG]], #8] ; Important point is that we address sp+24 for second dword -; CHECK: add [[REG:x[0-9]+]], sp, #16 -; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG]], #8] +; CHECK: ldr {{x[0-9]+}}, [sp, #16] ret void } declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) define i32 @test_extern() { -; CHECK: test_extern: +; CHECK-LABEL: test_extern: call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* undef, i32 undef, i32 4, i1 0) ; CHECK: bl memcpy ret i32 0 diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll index 13b689c408869..ac188bb3bb573 100644 --- a/test/CodeGen/AArch64/func-calls.ll +++ b/test/CodeGen/AArch64/func-calls.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s %myStruct = type { i64 , i8, i32 } @@ -17,20 +18,22 @@ declare void @take_i8s(i8 %val1, i8 %val2) declare void @take_floats(float %val1, float %val2) define void @simple_args() { -; CHECK: simple_args: +; CHECK-LABEL: simple_args: %char1 = load i8* @var8 %char2 = load i8* @var8_2 call void @take_i8s(i8 %char1, i8 %char2) -; CHECK: ldrb w0, [{{x[0-9]+}}, #:lo12:var8] -; CHECK: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2] +; CHECK-DAG: ldrb w0, [{{x[0-9]+}}, #:lo12:var8] +; CHECK-DAG: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2] ; CHECK: bl take_i8s %float1 = load float* @varfloat %float2 = load float* @varfloat_2 call void @take_floats(float %float1, float %float2) -; CHECK: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2] -; CHECK: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat] +; CHECK-DAG: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2] +; CHECK-DAG: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat] ; CHECK: bl take_floats +; CHECK-NOFP-NOT: ldr s1, +; CHECK-NOFP-NOT: ldr s0, ret void } @@ -41,7 +44,7 @@ declare [2 x i64] @return_smallstruct() declare void @return_large_struct(%myStruct* sret %retval) define void @simple_rets() { -; CHECK: simple_rets: +; CHECK-LABEL: simple_rets: %int = call i32 @return_int() store i32 %int, i32* @var32 @@ -52,6 +55,7 @@ define void @simple_rets() { store double %dbl, double* @vardouble ; CHECK: bl return_double ; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble] +; CHECK-NOFP-NOT: str d0, %arr = call [2 x i64] @return_smallstruct() store [2 x i64] %arr, [2 x i64]* @varsmallstruct @@ -75,17 +79,19 @@ declare void @stacked_fpu(float %var0, double %var1, float %var2, float %var3, float %var8) define void @check_stack_args() { +; CHECK-LABEL: check_stack_args: call i32 @struct_on_stack(i8 0, i16 12, i32 42, i64 99, i128 1, i32* @var32, %myStruct* byval @varstruct, i32 999, double 1.0) ; Want to check that the final double is passed in registers and ; that varstruct is passed on the stack. Rather dependent on how a ; memcpy gets created, but the following works for now. 
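; The hunk that follows rewrites consecutive plain CHECK lines as CHECK-DAG
; (capturing the copy of the stack pointer into [[SPREG]]), because the two
; stores and the fmov are independent and may legally be scheduled in any
; order. A standalone sketch of the CHECK-DAG idiom, with hypothetical
; globals and a hypothetical function rather than code from the patch:

; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s

@var_a = global i32 0
@var_b = global i32 0

define void @two_stores(i32 %x, i32 %y) {
; CHECK-LABEL: two_stores:
  store i32 %x, i32* @var_a
  store i32 %y, i32* @var_b
; The stores do not depend on each other, so the scheduler may emit them in
; either order. CHECK-DAG accepts both orders; two plain CHECK lines would
; hard-code one and break whenever the other is chosen.
; CHECK-DAG: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_a]
; CHECK-DAG: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_b]
  ret void
}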
-; CHECK: mov x0, sp -; CHECK: str {{w[0-9]+}}, [x0] -; CHECK: str {{w[0-9]+}}, [x0, #12] -; CHECK: fmov d0, +; CHECK: mov x[[SPREG:[0-9]+]], sp +; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]]] +; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]], #12] +; CHECK-DAG: fmov d0, ; CHECK: bl struct_on_stack +; CHECK-NOFP-NOT: fmov call void @stacked_fpu(float -1.0, double 1.0, float 4.0, float 2.0, float -2.0, float -8.0, float 16.0, float 1.0, @@ -106,7 +112,7 @@ declare void @check_i128_regalign(i32 %val0, i128 %val1) define void @check_i128_align() { -; CHECK: check_i128_align: +; CHECK-LABEL: check_i128_align: %val = load i128* @var128 call void @check_i128_stackalign(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, @@ -130,7 +136,7 @@ define void @check_i128_align() { @fptr = global void()* null define void @check_indirect_call() { -; CHECK: check_indirect_call: +; CHECK-LABEL: check_indirect_call: %func = load void()** @fptr call void %func() ; CHECK: ldr [[FPTR:x[0-9]+]], [{{x[0-9]+}}, #:lo12:fptr] diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll index 8ed6e551cdebc..56e5cba519c1b 100644 --- a/test/CodeGen/AArch64/global-alignment.ll +++ b/test/CodeGen/AArch64/global-alignment.ll @@ -5,7 +5,7 @@ @var32_align64 = global [3 x i32] zeroinitializer, align 8 define i64 @test_align32() { -; CHECK: test_align32: +; CHECK-LABEL: test_align32: %addr = bitcast [3 x i32]* @var32 to i64* ; Since @var32 is only guaranteed to be aligned to 32-bits, it's invalid to @@ -19,7 +19,7 @@ define i64 @test_align32() { } define i64 @test_align64() { -; CHECK: test_align64: +; CHECK-LABEL: test_align64: %addr = bitcast [3 x i64]* @var64 to i64* ; However, var64 *is* properly aligned and emitting an adrp/add/ldr would be @@ -33,7 +33,7 @@ define i64 @test_align64() { } define i64 @test_var32_align64() { -; CHECK: test_var32_align64: +; CHECK-LABEL: test_var32_align64: %addr = bitcast [3 x i32]* @var32_align64 to i64* ; Since @var32 is only guaranteed to be aligned to 32-bits, it's invalid to @@ -49,7 +49,7 @@ define i64 @test_var32_align64() { @yet_another_var = external global {i32, i32} define i64 @test_yet_another_var() { -; CHECK: test_yet_another_var: +; CHECK-LABEL: test_yet_another_var: ; @yet_another_var has a preferred alignment of 8, but that's not enough if ; we're going to be linking against other things. 
Its ABI alignment is only 4 @@ -62,7 +62,7 @@ define i64 @test_yet_another_var() { } define i64()* @test_functions() { -; CHECK: test_functions: +; CHECK-LABEL: test_functions: ret i64()* @test_yet_another_var ; CHECK: adrp [[HIBITS:x[0-9]+]], test_yet_another_var ; CHECK: add x0, [[HIBITS]], #:lo12:test_yet_another_var diff --git a/test/CodeGen/AArch64/got-abuse.ll b/test/CodeGen/AArch64/got-abuse.ll index c474e5845a64f..8b06031c88f73 100644 --- a/test/CodeGen/AArch64/got-abuse.ll +++ b/test/CodeGen/AArch64/got-abuse.ll @@ -13,7 +13,7 @@ declare void @consume(i32) declare void @func() define void @foo() nounwind { -; CHECK: foo: +; CHECK-LABEL: foo: entry: call void @consume(i32 ptrtoint (void ()* @func to i32)) ; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:func diff --git a/test/CodeGen/AArch64/i128-align.ll b/test/CodeGen/AArch64/i128-align.ll index f019ea0a6706a..21ca7eda66bb7 100644 --- a/test/CodeGen/AArch64/i128-align.ll +++ b/test/CodeGen/AArch64/i128-align.ll @@ -5,7 +5,7 @@ @var = global %struct zeroinitializer define i64 @check_size() { -; CHECK: check_size: +; CHECK-LABEL: check_size: %starti = ptrtoint %struct* @var to i64 %endp = getelementptr %struct* @var, i64 1 @@ -17,7 +17,7 @@ define i64 @check_size() { } define i64 @check_field() { -; CHECK: check_field: +; CHECK-LABEL: check_field: %starti = ptrtoint %struct* @var to i64 %endp = getelementptr %struct* @var, i64 0, i32 1 @@ -26,4 +26,4 @@ define i64 @check_field() { %diff = sub i64 %endi, %starti ret i64 %diff ; CHECK: movz x0, #16 -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/illegal-float-ops.ll b/test/CodeGen/AArch64/illegal-float-ops.ll index 446151b8ffacd..03c6d8d10087b 100644 --- a/test/CodeGen/AArch64/illegal-float-ops.ll +++ b/test/CodeGen/AArch64/illegal-float-ops.ll @@ -9,7 +9,7 @@ declare double @llvm.cos.f64(double) declare fp128 @llvm.cos.f128(fp128) define void @test_cos(float %float, double %double, fp128 %fp128) { -; CHECK: test_cos: +; CHECK-LABEL: test_cos: %cosfloat = call float @llvm.cos.f32(float %float) store float %cosfloat, float* @varfloat @@ -31,7 +31,7 @@ declare double @llvm.exp.f64(double) declare fp128 @llvm.exp.f128(fp128) define void @test_exp(float %float, double %double, fp128 %fp128) { -; CHECK: test_exp: +; CHECK-LABEL: test_exp: %expfloat = call float @llvm.exp.f32(float %float) store float %expfloat, float* @varfloat @@ -53,7 +53,7 @@ declare double @llvm.exp2.f64(double) declare fp128 @llvm.exp2.f128(fp128) define void @test_exp2(float %float, double %double, fp128 %fp128) { -; CHECK: test_exp2: +; CHECK-LABEL: test_exp2: %exp2float = call float @llvm.exp2.f32(float %float) store float %exp2float, float* @varfloat @@ -75,7 +75,7 @@ declare double @llvm.log.f64(double) declare fp128 @llvm.log.f128(fp128) define void @test_log(float %float, double %double, fp128 %fp128) { -; CHECK: test_log: +; CHECK-LABEL: test_log: %logfloat = call float @llvm.log.f32(float %float) store float %logfloat, float* @varfloat @@ -97,7 +97,7 @@ declare double @llvm.log2.f64(double) declare fp128 @llvm.log2.f128(fp128) define void @test_log2(float %float, double %double, fp128 %fp128) { -; CHECK: test_log2: +; CHECK-LABEL: test_log2: %log2float = call float @llvm.log2.f32(float %float) store float %log2float, float* @varfloat @@ -119,7 +119,7 @@ declare double @llvm.log10.f64(double) declare fp128 @llvm.log10.f128(fp128) define void @test_log10(float %float, double %double, fp128 %fp128) { -; CHECK: test_log10: +; CHECK-LABEL: test_log10: %log10float = call float @llvm.log10.f32(float %float) store float %log10float, float* @varfloat @@ -141,7 +141,7 @@ declare double @llvm.sin.f64(double) declare fp128 @llvm.sin.f128(fp128) define void @test_sin(float %float, double %double, fp128 %fp128) { -; CHECK: test_sin: +; CHECK-LABEL: test_sin: %sinfloat = call float @llvm.sin.f32(float %float) store float %sinfloat, float* @varfloat @@ -163,7 +163,7 @@ declare double @llvm.pow.f64(double, double) declare fp128 @llvm.pow.f128(fp128, fp128) define void @test_pow(float %float, double %double, fp128 %fp128) { -; CHECK: test_pow: +; CHECK-LABEL: test_pow: %powfloat = call float @llvm.pow.f32(float %float, float %float) store float %powfloat, float* @varfloat @@ -185,7 +185,7 @@ declare double @llvm.powi.f64(double, i32) declare fp128 @llvm.powi.f128(fp128, i32) define void @test_powi(float %float, double %double, i32 %exponent, fp128 %fp128) { -; CHECK: test_powi: +; CHECK-LABEL: test_powi: %powifloat = call float @llvm.powi.f32(float %float, i32 %exponent) store float %powifloat, float* @varfloat @@ -203,7 +203,7 @@ define void @test_powi(float %float, double %double, i32 %exponent, fp128 %fp128 } define void @test_frem(float %float, double %double, fp128 %fp128) { -; CHECK: test_frem: +; CHECK-LABEL: test_frem: %fremfloat = frem float %float, %float store float %fremfloat, float* @varfloat @@ -219,3 +219,29 @@ define void @test_frem(float %float, double %double, fp128 %fp128) { ret void } + +declare fp128 @llvm.fma.f128(fp128, fp128, fp128) + +define void @test_fma(fp128 %fp128) { +; 
CHECK-LABEL: test_fma: + + %fmafp128 = call fp128 @llvm.fma.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128) + store fp128 %fmafp128, fp128* @varfp128 +; CHECK: bl fmal + + ret void +} + +declare fp128 @llvm.fmuladd.f128(fp128, fp128, fp128) + +define void @test_fmuladd(fp128 %fp128) { +; CHECK-LABEL: test_fmuladd: + + %fmuladdfp128 = call fp128 @llvm.fmuladd.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128) + store fp128 %fmuladdfp128, fp128* @varfp128 +; CHECK-NOT: bl fmal +; CHECK: bl __multf3 +; CHECK: bl __addtf3 + + ret void +} diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll index d80be8f3a6394..3ff1c1a86ec64 100644 --- a/test/CodeGen/AArch64/init-array.ll +++ b/test/CodeGen/AArch64/init-array.ll @@ -6,4 +6,4 @@ define internal void @_GLOBAL__I_a() section ".text.startup" { @llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] -; CHECK: .section .init_array
\ No newline at end of file +; CHECK: .section .init_array diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll index c39c57f058224..61bbfc2013547 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll @@ -4,4 +4,4 @@ define void @foo() { ; Out of range immediate for I. call void asm sideeffect "add x0, x0, $0", "I"(i32 4096) ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll index 47c5f98bf009f..40746e1528ce3 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll @@ -4,4 +4,4 @@ define void @foo() { ; 32-bit bitpattern ending in 1101 can't be produced. call void asm sideeffect "and w0, w0, $0", "K"(i32 13) ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll index 7a5b99e23b3df..2c5338191fdeb 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll @@ -4,4 +4,4 @@ define void @foo() { ; 32-bit bitpattern ending in 1101 can't be produced. call void asm sideeffect "and w0, w0, $0", "K"(i64 4294967296) ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll index 4f0039865a352..d82d5a2ee4d00 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll @@ -4,4 +4,4 @@ define void @foo() { ; 32-bit bitpattern ending in 1101 can't be produced. call void asm sideeffect "and x0, x0, $0", "L"(i32 13) ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll index c232f3208cfa8..18a3b37b41d1c 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints.ll @@ -1,21 +1,21 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s define i64 @test_inline_constraint_r(i64 %base, i32 %offset) { -; CHECK: test_inline_constraint_r: +; CHECK-LABEL: test_inline_constraint_r: %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 %base, i32 %offset) ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw ret i64 %val } define i16 @test_small_reg(i16 %lhs, i16 %rhs) { -; CHECK: test_small_reg: +; CHECK-LABEL: test_small_reg: %val = call i16 asm sideeffect "add $0, $1, $2, sxth", "=r,r,r"(i16 %lhs, i16 %rhs) ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth ret i16 %val } define i64 @test_inline_constraint_r_imm(i64 %base, i32 %offset) { -; CHECK: test_inline_constraint_r_imm: +; CHECK-LABEL: test_inline_constraint_r_imm: %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 4, i32 12) ; CHECK: movz [[FOUR:x[0-9]+]], #4 ; CHECK: movz [[TWELVE:w[0-9]+]], #12 @@ -26,7 +26,7 @@ define i64 @test_inline_constraint_r_imm(i64 %base, i32 %offset) { ; m is permitted to have a base/offset form. We don't do that ; currently though. define i32 @test_inline_constraint_m(i32 *%ptr) { -; CHECK: test_inline_constraint_m: +; CHECK-LABEL: test_inline_constraint_m: %val = call i32 asm "ldr $0, $1", "=r,m"(i32 *%ptr) ; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] ret i32 %val @@ -36,7 +36,7 @@ define i32 @test_inline_constraint_m(i32 *%ptr) { ; Q should *never* have base/offset form even if given the chance. define i32 @test_inline_constraint_Q(i32 *%ptr) { -; CHECK: test_inline_constraint_Q: +; CHECK-LABEL: test_inline_constraint_Q: %val = call i32 asm "ldr $0, $1", "=r,Q"(i32* getelementptr([8 x i32]* @arr, i32 0, i32 1)) ; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] ret i32 %val @@ -44,8 +44,28 @@ define i32 @test_inline_constraint_Q(i32 *%ptr) { @dump = global fp128 zeroinitializer +define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) { +; CHECK: test_inline_constraint_w: + call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64) + call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128) +; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + + ; Arguably semantically dodgy to output "vN", but it's what GCC does + ; so purely for compatibility we want vector registers to be output. 
+ call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef) + call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt) + call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl) + call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad) +; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}} +; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret void +} + define void @test_inline_constraint_I() { -; CHECK: test_inline_constraint_I: +; CHECK-LABEL: test_inline_constraint_I: call void asm sideeffect "add x0, x0, $0", "I"(i32 0) call void asm sideeffect "add x0, x0, $0", "I"(i64 4095) ; CHECK: add x0, x0, #0 @@ -57,7 +77,7 @@ define void @test_inline_constraint_I() { ; Skip J because it's useless define void @test_inline_constraint_K() { -; CHECK: test_inline_constraint_K: +; CHECK-LABEL: test_inline_constraint_K: call void asm sideeffect "and w0, w0, $0", "K"(i32 2863311530) ; = 0xaaaaaaaa call void asm sideeffect "and w0, w0, $0", "K"(i32 65535) ; CHECK: and w0, w0, #-1431655766 @@ -67,7 +87,7 @@ define void @test_inline_constraint_K() { } define void @test_inline_constraint_L() { -; CHECK: test_inline_constraint_L: +; CHECK-LABEL: test_inline_constraint_L: call void asm sideeffect "and x0, x0, $0", "L"(i64 4294967296) ; = 0xaaaaaaaa call void asm sideeffect "and x0, x0, $0", "L"(i64 65535) ; CHECK: and x0, x0, #4294967296 @@ -81,7 +101,7 @@ define void @test_inline_constraint_L() { @var = global i32 0 define void @test_inline_constraint_S() { -; CHECK: test_inline_constraint_S: +; CHECK-LABEL: test_inline_constraint_S: call void asm sideeffect "adrp x0, $0", "S"(i32* @var) call void asm sideeffect "adrp x0, ${0:A}", "S"(i32* @var) call void asm sideeffect "add x0, x0, ${0:L}", "S"(i32* @var) @@ -92,7 +112,7 @@ define void @test_inline_constraint_S() { } define i32 @test_inline_constraint_S_label(i1 %in) { -; CHECK: test_inline_constraint_S_label: +; CHECK-LABEL: test_inline_constraint_S_label: call void asm sideeffect "adr x0, $0", "S"(i8* blockaddress(@test_inline_constraint_S_label, %loc)) ; CHECK: adr x0, .Ltmp{{[0-9]+}} br i1 %in, label %loc, label %loc2 @@ -103,15 +123,15 @@ loc2: } define void @test_inline_constraint_Y() { -; CHECK: test_inline_constraint_Y: +; CHECK-LABEL: test_inline_constraint_Y: call void asm sideeffect "fcmp s0, $0", "Y"(float 0.0) ; CHECK: fcmp s0, #0.0 ret void } define void @test_inline_constraint_Z() { -; CHECK: test_inline_constraint_Z: +; CHECK-LABEL: test_inline_constraint_Z: call void asm sideeffect "cmp w0, $0", "Z"(i32 0) ; CHECK: cmp w0, #0 ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll index 3b55945561eb1..b7f4d3c57ba37 100644 --- a/test/CodeGen/AArch64/inline-asm-modifiers.ll +++ b/test/CodeGen/AArch64/inline-asm-modifiers.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s @var_simple = hidden global i32 0 @var_got = global i32 0 @@ -9,7 +8,7 @@ @var_tlsle = thread_local(localexec) global i32 0 define void @test_inline_modifier_L() nounwind { -; CHECK: test_inline_modifier_L: +; CHECK-LABEL: test_inline_modifier_L: call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_simple) call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_got) call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsgd) @@ -23,31 +22,28 @@ define void @test_inline_modifier_L() nounwind { ; CHECK: ldr x0, [x0, #:gottprel_lo12:var_tlsie] ; CHECK: add x0, x0, #:tprel_lo12:var_tlsle -; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC var_simple -; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var_got -; CHECK-ELF: R_AARCH64_TLSDESC_ADD_LO12_NC var_tlsgd -; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_LO12 var_tlsld -; CHECK-ELF: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC var_tlsie -; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_LO12 var_tlsle + call void asm sideeffect "add x0, x0, ${0:L}", "Si,~{x0}"(i32 64) + call void asm sideeffect "ldr x0, [x0, ${0:L}]", "Si,~{x0}"(i32 64) +; CHECK: add x0, x0, #64 +; CHECK: ldr x0, [x0, #64] ret void } define void @test_inline_modifier_G() nounwind { -; CHECK: test_inline_modifier_G: +; CHECK-LABEL: test_inline_modifier_G: call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsld) call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsle) ; CHECK: add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12 ; CHECK: add x0, x0, #:tprel_hi12:var_tlsle, lsl #12 -; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_HI12 var_tlsld -; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_HI12 var_tlsle - + call void asm sideeffect "add x0, x0, ${0:G}", "Si,~{x0}"(i32 42) +; CHECK: add x0, x0, #42 ret void } define void @test_inline_modifier_A() nounwind { -; CHECK: test_inline_modifier_A: +; CHECK-LABEL: test_inline_modifier_A: call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_simple) call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_got) call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsgd) @@ -58,16 +54,14 @@ define void @test_inline_modifier_A() nounwind { ; CHECK: adrp x0, :tlsdesc:var_tlsgd ; CHECK: adrp x0, :gottprel:var_tlsie -; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 var_simple -; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var_got -; CHECK-ELF: R_AARCH64_TLSDESC_ADR_PAGE var_tlsgd -; CHECK-ELF: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 var_tlsie + call void asm sideeffect "adrp x0, ${0:A}", "Si,~{x0}"(i32 40) +; CHECK: adrp x0, #40 ret void } define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind { -; CHECK: test_inline_modifier_wx: +; CHECK-LABEL: test_inline_modifier_wx: call i32 asm sideeffect "add $0, $0, $0", "=r,0"(i32 %small) call i32 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i32 %small) call i32 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i32 %small) @@ -87,11 +81,17 @@ define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind { call i32 asm sideeffect 
"add ${0:x}, ${1:x}, ${1:x}", "=r,r"(i32 0) ; CHECK: add {{w[0-9]+}}, wzr, wzr ; CHECK: add {{x[0-9]+}}, xzr, xzr + + call i32 asm sideeffect "add ${0:w}, ${0:w}, ${1:w}", "=r,Ir,0"(i32 123, i32 %small) + call i64 asm sideeffect "add ${0:x}, ${0:x}, ${1:x}", "=r,Ir,0"(i32 456, i64 %big) +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #123 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #456 + ret void } define void @test_inline_modifier_bhsdq() nounwind { -; CHECK: test_inline_modifier_bhsdq: +; CHECK-LABEL: test_inline_modifier_bhsdq: call float asm sideeffect "ldr ${0:b}, [sp]", "=w"() call float asm sideeffect "ldr ${0:h}, [sp]", "=w"() call float asm sideeffect "ldr ${0:s}, [sp]", "=w"() @@ -113,13 +113,35 @@ define void @test_inline_modifier_bhsdq() nounwind { ; CHECK: ldr s0, [sp] ; CHECK: ldr d0, [sp] ; CHECK: ldr q0, [sp] + + call void asm sideeffect "fcmp b0, ${0:b}", "Yw"(float 0.0) + call void asm sideeffect "fcmp h0, ${0:h}", "Yw"(float 0.0) + call void asm sideeffect "fcmp s0, ${0:s}", "Yw"(float 0.0) + call void asm sideeffect "fcmp d0, ${0:d}", "Yw"(float 0.0) + call void asm sideeffect "fcmp q0, ${0:q}", "Yw"(float 0.0) +; CHECK: fcmp b0, #0 +; CHECK: fcmp h0, #0 +; CHECK: fcmp s0, #0 +; CHECK: fcmp d0, #0 +; CHECK: fcmp q0, #0 + ret void } define void @test_inline_modifier_c() nounwind { -; CHECK: test_inline_modifier_c: +; CHECK-LABEL: test_inline_modifier_c: call void asm sideeffect "adr x0, ${0:c}", "i"(i32 3) ; CHECK: adr x0, 3 ret void -}
\ No newline at end of file +} + +define void @test_inline_modifier_a() nounwind { +; CHECK-LABEL: test_inline_modifier_a: + call void asm sideeffect "prfm pldl1keep, ${0:a}", "r"(i32* @var_simple) +; CHECK: adrp [[VARHI:x[0-9]+]], var_simple +; CHECK: add x[[VARADDR:[0-9]+]], [[VARHI]], #:lo12:var_simple +; CHECK: prfm pldl1keep, [x[[VARADDR]]] + ret void +} + diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll index 3c7f5f9ec1b07..4bb094217af3c 100644 --- a/test/CodeGen/AArch64/jump-table.ll +++ b/test/CodeGen/AArch64/jump-table.ll @@ -1,6 +1,5 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s -check-prefix=CHECK-ELF define i32 @test_jumptable(i32 %in) { ; CHECK: test_jumptable @@ -48,19 +47,3 @@ lbl4: ; CHECK-NEXT: .xword ; CHECK-NEXT: .xword ; CHECK-NEXT: .xword - -; ELF tests: - -; First make sure we get a page/lo12 pair in .text to pick up the jump-table - -; CHECK-ELF: Relocations [ -; CHECK-ELF: Section ({{[0-9]+}}) .text { -; CHECK-ELF-NEXT: 0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 .rodata -; CHECK-ELF-NEXT: 0x{{[0-9,A-F]+}} R_AARCH64_ADD_ABS_LO12_NC .rodata -; CHECK-ELF: } - -; Also check the targets in .rodata are relocated -; CHECK-ELF: Section ({{[0-9]+}}) .rodata { -; CHECK-ELF-NEXT: 0x{{[0-9,A-F]+}} R_AARCH64_ABS64 .text -; CHECK-ELF: } -; CHECK-ELF: ] diff --git a/test/CodeGen/AArch64/large-consts.ll b/test/CodeGen/AArch64/large-consts.ll new file mode 100644 index 0000000000000..1b769c6e350d3 --- /dev/null +++ b/test/CodeGen/AArch64/large-consts.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s + +; Make sure the shift amount is encoded into the instructions by LLVM because +; it's not the linker's job to put it there. 
+ +define double @foo() { +; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [A,A,0xe0'A',0xd2'A'] +; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [A,A,0xc0'A',0xf2'A'] +; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [A,A,0xa0'A',0xf2'A'] +; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [A,A,0x80'A',0xf2'A'] + + ret double 3.14159 +} diff --git a/test/CodeGen/AArch64/large-frame.ll b/test/CodeGen/AArch64/large-frame.ll index 2b2e1295c4f69..fde3036aef4ab 100644 --- a/test/CodeGen/AArch64/large-frame.ll +++ b/test/CodeGen/AArch64/large-frame.ll @@ -4,17 +4,21 @@ declare void @use_addr(i8*) @addr = global i8* null define void @test_bigframe() { -; CHECK: test_bigframe: +; CHECK-LABEL: test_bigframe: +; CHECK: .cfi_startproc %var1 = alloca i8, i32 20000000 %var2 = alloca i8, i32 16 %var3 = alloca i8, i32 20000000 ; CHECK: sub sp, sp, #496 +; CHECK: .cfi_def_cfa sp, 496 ; CHECK: str x30, [sp, #488] ; Total adjust is 39999536 ; CHECK: movz [[SUBCONST:x[0-9]+]], #22576 ; CHECK: movk [[SUBCONST]], #610, lsl #16 ; CHECK: sub sp, sp, [[SUBCONST]] +; CHECK: .cfi_def_cfa sp, 40000032 +; CHECK: .cfi_offset x30, -8 ; Total offset is 20000024 ; CHECK: movz [[VAR1OFFSET:x[0-9]+]], #11544 @@ -41,11 +45,12 @@ define void @test_bigframe() { ; CHECK: movz [[ADDCONST:x[0-9]+]], #22576 ; CHECK: movk [[ADDCONST]], #610, lsl #16 ; CHECK: add sp, sp, [[ADDCONST]] +; CHECK: .cfi_endproc ret void } define void @test_mediumframe() { -; CHECK: test_mediumframe: +; CHECK-LABEL: test_mediumframe: %var1 = alloca i8, i32 1000000 %var2 = alloca i8, i32 16 %var3 = alloca i8, i32 1000000 @@ -88,7 +93,7 @@ define void @test_mediumframe() { ; If temporary registers are allocated for adjustment, they should *not* clobber ; argument registers. 
define void @test_tempallocation([8 x i64] %val) nounwind { -; CHECK: test_tempallocation: +; CHECK-LABEL: test_tempallocation: %var = alloca i8, i32 1000000 ; CHECK: sub sp, sp, diff --git a/test/CodeGen/AArch64/ldst-regoffset.ll b/test/CodeGen/AArch64/ldst-regoffset.ll index 45935129fd7e0..db30fd915fb05 100644 --- a/test/CodeGen/AArch64/ldst-regoffset.ll +++ b/test/CodeGen/AArch64/ldst-regoffset.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var_8bit = global i8 0 @var_16bit = global i16 0 @@ -9,7 +10,7 @@ @var_double = global double 0.0 define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) { -; CHECK: ldst_8bit: +; CHECK-LABEL: ldst_8bit: %addr8_sxtw = getelementptr i8* %base, i32 %off32 %val8_sxtw = load volatile i8* %addr8_sxtw @@ -37,7 +38,7 @@ define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) { define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) { -; CHECK: ldst_16bit: +; CHECK-LABEL: ldst_16bit: %addr8_sxtwN = getelementptr i16* %base, i32 %off32 %val8_sxtwN = load volatile i16* %addr8_sxtwN @@ -91,7 +92,7 @@ define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) { } define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) { -; CHECK: ldst_32bit: +; CHECK-LABEL: ldst_32bit: %addr_sxtwN = getelementptr i32* %base, i32 %off32 %val_sxtwN = load volatile i32* %addr_sxtwN @@ -143,7 +144,7 @@ define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) { } define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) { -; CHECK: ldst_64bit: +; CHECK-LABEL: ldst_64bit: %addr_sxtwN = getelementptr i64* %base, i32 %off32 %val_sxtwN = load volatile i64* %addr_sxtwN @@ -191,17 +192,19 @@ define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) { } define void @ldst_float(float* %base, i32 %off32, i64 %off64) { -; CHECK: ldst_float: +; CHECK-LABEL: ldst_float: %addr_sxtwN = getelementptr float* %base, i32 %off32 %val_sxtwN = load volatile float* %addr_sxtwN store volatile float %val_sxtwN, float* @var_float ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2] +; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, %addr_lslN = getelementptr float* %base, i64 %off64 %val_lslN = load volatile float* %addr_lslN store volatile float %val_lslN, float* @var_float ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #2] +; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, %addrint_uxtw = ptrtoint float* %base to i64 %offset_uxtw = zext i32 %off32 to i64 @@ -210,6 +213,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) { %val_uxtw = load volatile float* %addr_uxtw store volatile float %val_uxtw, float* @var_float ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] +; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, %base_sxtw = ptrtoint float* %base to i64 %offset_sxtw = sext i32 %off32 to i64 @@ -218,6 +222,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) { %val64_sxtw = load volatile float* %addr_sxtw store volatile float %val64_sxtw, float* @var_float ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] +; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, %base_lsl = ptrtoint float* %base to i64 %addrint_lsl = add i64 %base_lsl, %off64 @@ -225,6 +230,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) { %val64_lsl = load volatile float* %addr_lsl store volatile float %val64_lsl, float* @var_float ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] +; CHECK-NOFP-NOT: ldr 
{{s[0-9]+}}, %base_uxtwN = ptrtoint float* %base to i64 %offset_uxtwN = zext i32 %off32 to i64 @@ -234,21 +240,24 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) { %val64 = load volatile float* @var_float store volatile float %val64, float* %addr_uxtwN ; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2] +; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, ret void } define void @ldst_double(double* %base, i32 %off32, i64 %off64) { -; CHECK: ldst_double: +; CHECK-LABEL: ldst_double: %addr_sxtwN = getelementptr double* %base, i32 %off32 %val_sxtwN = load volatile double* %addr_sxtwN store volatile double %val_sxtwN, double* @var_double ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, %addr_lslN = getelementptr double* %base, i64 %off64 %val_lslN = load volatile double* %addr_lslN store volatile double %val_lslN, double* @var_double ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #3] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, %addrint_uxtw = ptrtoint double* %base to i64 %offset_uxtw = zext i32 %off32 to i64 @@ -257,6 +266,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) { %val_uxtw = load volatile double* %addr_uxtw store volatile double %val_uxtw, double* @var_double ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, %base_sxtw = ptrtoint double* %base to i64 %offset_sxtw = sext i32 %off32 to i64 @@ -265,6 +275,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) { %val64_sxtw = load volatile double* %addr_sxtw store volatile double %val64_sxtw, double* @var_double ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, %base_lsl = ptrtoint double* %base to i64 %addrint_lsl = add i64 %base_lsl, %off64 @@ -272,6 +283,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) { %val64_lsl = load volatile double* %addr_lsl store volatile double %val64_lsl, double* @var_double ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, %base_uxtwN = ptrtoint double* %base to i64 %offset_uxtwN = zext i32 %off32 to i64 @@ -281,22 +293,25 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) { %val64 = load volatile double* @var_double store volatile double %val64, double* %addr_uxtwN ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, ret void } define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) { -; CHECK: ldst_128bit: +; CHECK-LABEL: ldst_128bit: %addr_sxtwN = getelementptr fp128* %base, i32 %off32 %val_sxtwN = load volatile fp128* %addr_sxtwN store volatile fp128 %val_sxtwN, fp128* %base ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] +; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] %addr_lslN = getelementptr fp128* %base, i64 %off64 %val_lslN = load volatile fp128* %addr_lslN store volatile fp128 %val_lslN, fp128* %base ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #4] +; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] %addrint_uxtw = ptrtoint fp128* %base to i64 %offset_uxtw = zext i32 %off32 to i64 @@ -305,6 +320,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) { %val_uxtw = load volatile fp128* %addr_uxtw store volatile fp128 %val_uxtw, fp128* %base ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] +; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] %base_sxtw = ptrtoint fp128* %base to i64 
%offset_sxtw = sext i32 %off32 to i64 @@ -313,6 +329,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) { %val64_sxtw = load volatile fp128* %addr_sxtw store volatile fp128 %val64_sxtw, fp128* %base ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] +; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] %base_lsl = ptrtoint fp128* %base to i64 %addrint_lsl = add i64 %base_lsl, %off64 @@ -320,6 +337,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) { %val64_lsl = load volatile fp128* %addr_lsl store volatile fp128 %val64_lsl, fp128* %base ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] +; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] %base_uxtwN = ptrtoint fp128* %base to i64 %offset_uxtwN = zext i32 %off32 to i64 @@ -329,5 +347,6 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) { %val64 = load volatile fp128* %base store volatile fp128 %val64, fp128* %addr_uxtwN ; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #4] +; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] ret void } diff --git a/test/CodeGen/AArch64/ldst-unscaledimm.ll b/test/CodeGen/AArch64/ldst-unscaledimm.ll index 78a3c83c3dd8d..bea5bb5d6dd68 100644 --- a/test/CodeGen/AArch64/ldst-unscaledimm.ll +++ b/test/CodeGen/AArch64/ldst-unscaledimm.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var_8bit = global i8 0 @var_16bit = global i16 0 @@ -11,7 +12,7 @@ @varptr = global i8* null define void @ldst_8bit() { -; CHECK: ldst_8bit: +; CHECK-LABEL: ldst_8bit: ; No architectural support for loads to 16-bit or 8-bit since we ; promote i8 during lowering. @@ -72,7 +73,7 @@ define void @ldst_8bit() { } define void @ldst_16bit() { -; CHECK: ldst_16bit: +; CHECK-LABEL: ldst_16bit: ; No architectural support for loads to 16-bit or 16-bit since we ; promote i16 during lowering. 
@@ -140,7 +141,7 @@ define void @ldst_16bit() { } define void @ldst_32bit() { -; CHECK: ldst_32bit: +; CHECK-LABEL: ldst_32bit: %addr_8bit = load i8** @varptr @@ -186,7 +187,7 @@ define void @ldst_32bit() { } define void @ldst_float() { -; CHECK: ldst_float: +; CHECK-LABEL: ldst_float: %addr_8bit = load i8** @varptr %addrfp_8 = getelementptr i8* %addr_8bit, i64 -5 @@ -194,15 +195,17 @@ define void @ldst_float() { %valfp = load volatile float* %addrfp ; CHECK: ldur {{s[0-9]+}}, [{{x[0-9]+}}, #-5] +; CHECK-NOFP-NOT: ldur {{s[0-9]+}}, store volatile float %valfp, float* %addrfp ; CHECK: stur {{s[0-9]+}}, [{{x[0-9]+}}, #-5] +; CHECK-NOFP-NOT: stur {{s[0-9]+}}, ret void } define void @ldst_double() { -; CHECK: ldst_double: +; CHECK-LABEL: ldst_double: %addr_8bit = load i8** @varptr %addrfp_8 = getelementptr i8* %addr_8bit, i64 4 @@ -210,9 +213,11 @@ define void @ldst_double() { %valfp = load volatile double* %addrfp ; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #4] +; CHECK-NOFP-NOT: ldur {{d[0-9]+}}, store volatile double %valfp, double* %addrfp ; CHECK: stur {{d[0-9]+}}, [{{x[0-9]+}}, #4] +; CHECK-NOFP-NOT: stur {{d[0-9]+}}, ret void } diff --git a/test/CodeGen/AArch64/ldst-unsignedimm.ll b/test/CodeGen/AArch64/ldst-unsignedimm.ll index 1e7540d9be0ac..44c1586e1ec72 100644 --- a/test/CodeGen/AArch64/ldst-unsignedimm.ll +++ b/test/CodeGen/AArch64/ldst-unsignedimm.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s @var_8bit = global i8 0 @var_16bit = global i16 0 @@ -9,7 +10,7 @@ @var_double = global double 0.0 define void @ldst_8bit() { -; CHECK: ldst_8bit: +; CHECK-LABEL: ldst_8bit: ; No architectural support for loads to 16-bit or 8-bit since we ; promote i8 during lowering. @@ -63,7 +64,7 @@ define void @ldst_8bit() { } define void @ldst_16bit() { -; CHECK: ldst_16bit: +; CHECK-LABEL: ldst_16bit: ; No architectural support for load volatiles to 16-bit promote i16 during ; lowering. 
@@ -117,7 +118,7 @@ define void @ldst_16bit() { } define void @ldst_32bit() { -; CHECK: ldst_32bit: +; CHECK-LABEL: ldst_32bit: ; Straight 32-bit load/store %val32_noext = load volatile i32* @var_32bit @@ -225,27 +226,31 @@ define void @ldst_complex_offsets() { } define void @ldst_float() { -; CHECK: ldst_float: +; CHECK-LABEL: ldst_float: %valfp = load volatile float* @var_float ; CHECK: adrp {{x[0-9]+}}, var_float ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float] +; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, store volatile float %valfp, float* @var_float ; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float] +; CHECK-NOFP-NOT: str {{s[0-9]+}}, ret void } define void @ldst_double() { -; CHECK: ldst_double: +; CHECK-LABEL: ldst_double: %valfp = load volatile double* @var_double ; CHECK: adrp {{x[0-9]+}}, var_double ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, store volatile double %valfp, double* @var_double ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double] +; CHECK-NOFP-NOT: str {{d[0-9]+}}, ret void } diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg index c5ce2411ed48c..9a66a00189eac 100644 --- a/test/CodeGen/AArch64/lit.local.cfg +++ b/test/CodeGen/AArch64/lit.local.cfg @@ -1,5 +1,3 @@ -config.suffixes = ['.ll', '.c', '.cpp'] - targets = set(config.root.targets_to_build.split()) if not 'AArch64' in targets: config.unsupported = True diff --git a/test/CodeGen/AArch64/literal_pools.ll b/test/CodeGen/AArch64/literal_pools.ll index 9cfa8c5426e44..fc33aee10d841 100644 --- a/test/CodeGen/AArch64/literal_pools.ll +++ b/test/CodeGen/AArch64/literal_pools.ll @@ -1,11 +1,13 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s @var32 = global i32 0 @var64 = global i64 0 define void @foo() { -; CHECK: foo: +; CHECK-LABEL: foo: %val32 = load i32* @var32 %val64 = load i64* @var64 @@ -60,13 +62,13 @@ define void @foo() { @vardouble = global double 0.0 define void @floating_lits() { -; CHECK: floating_lits: +; CHECK-LABEL: floating_lits: %floatval = load float* @varfloat %newfloat = fadd float %floatval, 128.0 ; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]] -; CHECK: ldr {{s[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]] -; CHECK: fadd +; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]] +; CHECK-NOFP-NOT: ldr {{s[0-9]+}}, ; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]] ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] @@ -74,20 +76,26 @@ define void @floating_lits() { ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] ; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]] ; CHECK-LARGE: fadd +; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}}, +; CHECK-NOFP-LARGE-NOT: fadd store float %newfloat, float* @varfloat %doubleval = load double* @vardouble %newdouble = fadd double %doubleval, 129.0 ; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]] -; CHECK: ldr {{d[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]] -; CHECK: fadd +; CHECK: ldr [[LIT129:d[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]] +; CHECK: fadd {{s[0-9]+}}, 
{{s[0-9]+}}, [[LIT128]] +; CHECK: fadd {{d[0-9]+}}, {{d[0-9]+}}, [[LIT129]] +; CHECK-NOFP-NOT: ldr {{d[0-9]+}}, +; CHECK-NOFP-NOT: fadd ; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]] ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]] ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]] ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]] ; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]] +; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}}, store double %newdouble, double* @vardouble diff --git a/test/CodeGen/AArch64/local_vars.ll b/test/CodeGen/AArch64/local_vars.ll index 5cbf5a37ec541..b5cef859e35fc 100644 --- a/test/CodeGen/AArch64/local_vars.ll +++ b/test/CodeGen/AArch64/local_vars.ll @@ -24,7 +24,7 @@ define void @trivial_func() nounwind { } define void @trivial_fp_func() { -; CHECK-WITHFP: trivial_fp_func: +; CHECK-WITHFP-LABEL: trivial_fp_func: ; CHECK-WITHFP: sub sp, sp, #16 ; CHECK-WITHFP: stp x29, x30, [sp] @@ -43,7 +43,7 @@ define void @trivial_fp_func() { define void @stack_local() { %local_var = alloca i64 -; CHECK: stack_local: +; CHECK-LABEL: stack_local: ; CHECK: sub sp, sp, #16 %val = load i64* @var diff --git a/test/CodeGen/AArch64/logical-imm.ll b/test/CodeGen/AArch64/logical-imm.ll index 5f3f4da0cdada..e04bb510ebf20 100644 --- a/test/CodeGen/AArch64/logical-imm.ll +++ b/test/CodeGen/AArch64/logical-imm.ll @@ -4,7 +4,7 @@ @var64 = global i64 0 define void @test_and(i32 %in32, i64 %in64) { -; CHECK: test_and: +; CHECK-LABEL: test_and: %val0 = and i32 %in32, 2863311530 store volatile i32 %val0, i32* @var32 @@ -26,7 +26,7 @@ define void @test_and(i32 %in32, i64 %in64) { } define void @test_orr(i32 %in32, i64 %in64) { -; CHECK: test_orr: +; CHECK-LABEL: test_orr: %val0 = or i32 %in32, 2863311530 store volatile i32 %val0, i32* @var32 @@ -48,7 +48,7 @@ define void @test_orr(i32 %in32, i64 %in64) { } define void @test_eor(i32 %in32, i64 %in64) { -; CHECK: test_eor: +; CHECK-LABEL: test_eor: %val0 = xor i32 %in32, 2863311530 store volatile i32 %val0, i32* @var32 @@ -70,7 +70,7 @@ define void @test_eor(i32 %in32, i64 %in64) { } define void @test_mov(i32 %in32, i64 %in64) { -; CHECK: test_mov: +; CHECK-LABEL: test_mov: %val0 = add i32 %in32, 2863311530 store i32 %val0, i32* @var32 ; CHECK: orr {{w[0-9]+}}, wzr, #0xaaaaaaaa diff --git a/test/CodeGen/AArch64/logical_shifted_reg.ll b/test/CodeGen/AArch64/logical_shifted_reg.ll index bbbfcc1b9118e..a08ba20c7f119 100644 --- a/test/CodeGen/AArch64/logical_shifted_reg.ll +++ b/test/CodeGen/AArch64/logical_shifted_reg.ll @@ -7,7 +7,7 @@ @var2_64 = global i64 0 define void @logical_32bit() { -; CHECK: logical_32bit: +; CHECK-LABEL: logical_32bit: %val1 = load i32* @var1_32 %val2 = load i32* @var2_32 @@ -97,7 +97,7 @@ define void @logical_32bit() { } define void @logical_64bit() { -; CHECK: logical_64bit: +; CHECK-LABEL: logical_64bit: %val1 = load i64* @var1_64 %val2 = load i64* @var2_64 @@ -190,7 +190,7 @@ define void @logical_64bit() { } define void @flag_setting() { -; CHECK: flag_setting: +; CHECK-LABEL: flag_setting: %val1 = load i64* @var1_64 %val2 = load i64* @var2_64 diff --git a/test/CodeGen/AArch64/logical_shifted_reg.s b/test/CodeGen/AArch64/logical_shifted_reg.s deleted file mode 100644 index 89aea580119bd..0000000000000 --- a/test/CodeGen/AArch64/logical_shifted_reg.s +++ /dev/null @@ -1,208 +0,0 @@ - .file "/home/timnor01/a64-trunk/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll" - .text - .globl logical_32bit - .type logical_32bit,@function -logical_32bit: // @logical_32bit - .cfi_startproc -// 
BB#0: - adrp x0, var1_32 - ldr w1, [x0, #:lo12:var1_32] - adrp x0, var2_32 - ldr w2, [x0, #:lo12:var2_32] - and w3, w1, w2 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - bic w3, w1, w2 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - orr w3, w1, w2 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - orn w3, w1, w2 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - eor w3, w1, w2 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - eon w3, w2, w1 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - and w3, w1, w2, lsl #31 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - bic w3, w1, w2, lsl #31 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - orr w3, w1, w2, lsl #31 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - orn w3, w1, w2, lsl #31 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - eor w3, w1, w2, lsl #31 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - eon w3, w1, w2, lsl #31 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - bic w3, w1, w2, asr #10 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - eor w3, w1, w2, asr #10 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - orn w3, w1, w2, lsr #1 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - eor w3, w1, w2, lsr #1 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - eon w3, w1, w2, ror #20 - adrp x0, var1_32 - str w3, [x0, #:lo12:var1_32] - and w1, w1, w2, ror #20 - adrp x0, var1_32 - str w1, [x0, #:lo12:var1_32] - ret -.Ltmp0: - .size logical_32bit, .Ltmp0-logical_32bit - .cfi_endproc - - .globl logical_64bit - .type logical_64bit,@function -logical_64bit: // @logical_64bit - .cfi_startproc -// BB#0: - adrp x0, var1_64 - ldr x0, [x0, #:lo12:var1_64] - adrp x1, var2_64 - ldr x1, [x1, #:lo12:var2_64] - and x2, x0, x1 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - bic x2, x0, x1 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - orr x2, x0, x1 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - orn x2, x0, x1 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - eor x2, x0, x1 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - eon x2, x1, x0 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - and x2, x0, x1, lsl #63 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - bic x2, x0, x1, lsl #63 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - orr x2, x0, x1, lsl #63 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - orn x2, x0, x1, lsl #63 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - eor x2, x0, x1, lsl #63 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - eon x2, x0, x1, lsl #63 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - bic x2, x0, x1, asr #10 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - eor x2, x0, x1, asr #10 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - orn x2, x0, x1, lsr #1 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - eor x2, x0, x1, lsr #1 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - eon x2, x0, x1, ror #20 - adrp x3, var1_64 - str x2, [x3, #:lo12:var1_64] - and x0, x0, x1, ror #20 - adrp x1, var1_64 - str x0, [x1, #:lo12:var1_64] - ret -.Ltmp1: - .size logical_64bit, .Ltmp1-logical_64bit - .cfi_endproc - - .globl flag_setting - .type flag_setting,@function -flag_setting: // @flag_setting - .cfi_startproc -// BB#0: - sub sp, sp, #16 - adrp x0, var1_64 - ldr x0, [x0, #:lo12:var1_64] - adrp x1, var2_64 - ldr x1, [x1, #:lo12:var2_64] - tst x0, x1 - str x0, [sp, #8] // 8-byte Folded Spill - str x1, [sp] // 8-byte Folded Spill - b.gt .LBB2_4 - b .LBB2_1 -.LBB2_1: // %test2 - ldr x0, [sp, #8] // 8-byte Folded Reload - ldr x1, [sp] // 8-byte Folded Reload - 
tst x0, x1, lsl #63 - b.lt .LBB2_4 - b .LBB2_2 -.LBB2_2: // %test3 - ldr x0, [sp, #8] // 8-byte Folded Reload - ldr x1, [sp] // 8-byte Folded Reload - tst x0, x1, asr #12 - b.gt .LBB2_4 - b .LBB2_3 -.LBB2_3: // %other_exit - adrp x0, var1_64 - ldr x1, [sp, #8] // 8-byte Folded Reload - str x1, [x0, #:lo12:var1_64] - add sp, sp, #16 - ret -.LBB2_4: // %ret - add sp, sp, #16 - ret -.Ltmp2: - .size flag_setting, .Ltmp2-flag_setting - .cfi_endproc - - .type var1_32,@object // @var1_32 - .bss - .globl var1_32 - .align 2 -var1_32: - .word 0 // 0x0 - .size var1_32, 4 - - .type var2_32,@object // @var2_32 - .globl var2_32 - .align 2 -var2_32: - .word 0 // 0x0 - .size var2_32, 4 - - .type var1_64,@object // @var1_64 - .globl var1_64 - .align 3 -var1_64: - .xword 0 // 0x0 - .size var1_64, 8 - - .type var2_64,@object // @var2_64 - .globl var2_64 - .align 3 -var2_64: - .xword 0 // 0x0 - .size var2_64, 8 - - diff --git a/test/CodeGen/AArch64/movw-consts.ll b/test/CodeGen/AArch64/movw-consts.ll index b8a5fb9322021..38e37db7b58cc 100644 --- a/test/CodeGen/AArch64/movw-consts.ll +++ b/test/CodeGen/AArch64/movw-consts.ll @@ -1,50 +1,50 @@ ; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i64 @test0() { -; CHECK: test0: +; CHECK-LABEL: test0: ; Not produced by move wide instructions, but good to make sure we can return 0 anyway: ; CHECK: mov x0, xzr ret i64 0 } define i64 @test1() { -; CHECK: test1: +; CHECK-LABEL: test1: ; CHECK: movz x0, #1 ret i64 1 } define i64 @test2() { -; CHECK: test2: +; CHECK-LABEL: test2: ; CHECK: movz x0, #65535 ret i64 65535 } define i64 @test3() { -; CHECK: test3: +; CHECK-LABEL: test3: ; CHECK: movz x0, #1, lsl #16 ret i64 65536 } define i64 @test4() { -; CHECK: test4: +; CHECK-LABEL: test4: ; CHECK: movz x0, #65535, lsl #16 ret i64 4294901760 } define i64 @test5() { -; CHECK: test5: +; CHECK-LABEL: test5: ; CHECK: movz x0, #1, lsl #32 ret i64 4294967296 } define i64 @test6() { -; CHECK: test6: +; CHECK-LABEL: test6: ; CHECK: movz x0, #65535, lsl #32 ret i64 281470681743360 } define i64 @test7() { -; CHECK: test7: +; CHECK-LABEL: test7: ; CHECK: movz x0, #1, lsl #48 ret i64 281474976710656 } @@ -52,19 +52,19 @@ define i64 @test7() { ; A 32-bit MOVN can generate some 64-bit patterns that a 64-bit one ; couldn't. 
Useful even for i64 define i64 @test8() { -; CHECK: test8: +; CHECK-LABEL: test8: ; CHECK: movn w0, #60875 ret i64 4294906420 } define i64 @test9() { -; CHECK: test9: +; CHECK-LABEL: test9: ; CHECK: movn x0, #0 ret i64 -1 } define i64 @test10() { -; CHECK: test10: +; CHECK-LABEL: test10: ; CHECK: movn x0, #60875, lsl #16 ret i64 18446744069720047615 } @@ -74,49 +74,49 @@ define i64 @test10() { @var32 = global i32 0 define void @test11() { -; CHECK: test11: +; CHECK-LABEL: test11: ; CHECK: mov {{w[0-9]+}}, wzr store i32 0, i32* @var32 ret void } define void @test12() { -; CHECK: test12: +; CHECK-LABEL: test12: ; CHECK: movz {{w[0-9]+}}, #1 store i32 1, i32* @var32 ret void } define void @test13() { -; CHECK: test13: +; CHECK-LABEL: test13: ; CHECK: movz {{w[0-9]+}}, #65535 store i32 65535, i32* @var32 ret void } define void @test14() { -; CHECK: test14: +; CHECK-LABEL: test14: ; CHECK: movz {{w[0-9]+}}, #1, lsl #16 store i32 65536, i32* @var32 ret void } define void @test15() { -; CHECK: test15: +; CHECK-LABEL: test15: ; CHECK: movz {{w[0-9]+}}, #65535, lsl #16 store i32 4294901760, i32* @var32 ret void } define void @test16() { -; CHECK: test16: +; CHECK-LABEL: test16: ; CHECK: movn {{w[0-9]+}}, #0 store i32 -1, i32* @var32 ret void } define i64 @test17() { -; CHECK: test17: +; CHECK-LABEL: test17: ; Mustn't MOVN w0 here. ; CHECK: movn x0, #2 diff --git a/test/CodeGen/AArch64/movw-shift-encoding.ll b/test/CodeGen/AArch64/movw-shift-encoding.ll new file mode 100644 index 0000000000000..ec133bd706b11 --- /dev/null +++ b/test/CodeGen/AArch64/movw-shift-encoding.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=aarch64-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s + +@var = global i32 0 + +; CodeGen should ensure that the correct shift bits are set, because the linker +; isn't going to! 
+ +define i32* @get_var() { + ret i32* @var +; CHECK: movz x0, #:abs_g3:var // encoding: [A,A,0xe0'A',0xd2'A'] +; CHECK: movk x0, #:abs_g2_nc:var // encoding: [A,A,0xc0'A',0xf2'A'] +; CHECK: movk x0, #:abs_g1_nc:var // encoding: [A,A,0xa0'A',0xf2'A'] +; CHECK: movk x0, #:abs_g0_nc:var // encoding: [A,A,0x80'A',0xf2'A'] +} diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/neon-2velem-high.ll new file mode 100644 index 0000000000000..97031d98b7c03 --- /dev/null +++ b/test/CodeGen/AArch64/neon-2velem-high.ll @@ -0,0 +1,331 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) + +define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) { +; CHECK: test_vmull_high_n_s16: +; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vmull15.i.i +} + +define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) { +; CHECK: test_vmull_high_n_s32: +; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vmull9.i.i +} + +define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) { +; CHECK: test_vmull_high_n_u16: +; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vmull15.i.i +} + +define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) { +; CHECK: 
test_vmull_high_n_u32: +; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vmull9.i.i +} + +define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) { +; CHECK: test_vqdmull_high_n_s16: +; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vqdmull15.i.i +} + +define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) { +; CHECK: test_vqdmull_high_n_s32: +; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vqdmull9.i.i +} + +define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlal_high_n_s16: +; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %add.i.i = add <4 x i32> %vmull2.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlal_high_n_s32: +; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %add.i.i = add <2 x i64> %vmull2.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlal_high_n_u16: +; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> 
%vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %add.i.i = add <4 x i32> %vmull2.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlal_high_n_u32: +; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %add.i.i = add <2 x i64> %vmull2.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vqdmlal_high_n_s16: +; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i) + ret <4 x i32> %vqdmlal17.i.i +} + +define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vqdmlal_high_n_s32: +; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i) + ret <2 x i64> %vqdmlal11.i.i +} + +define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlsl_high_n_s16: +; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i + ret <4 x i32> %sub.i.i +} + +define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlsl_high_n_s32: +; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> 
%vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i + ret <2 x i64> %sub.i.i +} + +define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vmlsl_high_n_u16: +; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i + ret <4 x i32> %sub.i.i +} + +define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vmlsl_high_n_u32: +; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i + ret <2 x i64> %sub.i.i +} + +define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) { +; CHECK: test_vqdmlsl_high_n_s16: +; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3 + %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i) + ret <4 x i32> %vqdmlsl17.i.i +} + +define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) { +; CHECK: test_vqdmlsl_high_n_s32: +; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1 + %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i) + ret <2 x i64> %vqdmlsl11.i.i +} + +define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) { +; CHECK: test_vmul_n_f32: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %vecinit.i = insertelement <2 x float> undef, float %b, i32 0 + %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1 + %mul.i = fmul <2 x float> %vecinit1.i, %a + ret <2 x float> %mul.i +} + +define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) { +; CHECK: 
test_vmulq_n_f32: +; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %vecinit.i = insertelement <4 x float> undef, float %b, i32 0 + %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1 + %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2 + %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3 + %mul.i = fmul <4 x float> %vecinit3.i, %a + ret <4 x float> %mul.i +} + +define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) { +; CHECK: test_vmulq_n_f64: +; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %vecinit.i = insertelement <2 x double> undef, double %b, i32 0 + %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1 + %mul.i = fmul <2 x double> %vecinit1.i, %a + ret <2 x double> %mul.i +} + +define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) { +; CHECK: test_vfma_n_f32: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <2 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1 + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) { +; CHECK: test_vfmaq_n_f32: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 + %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2 + %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3 + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) { +; CHECK: test_vfms_n_f32: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <2 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1 + %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b + %1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a) + ret <2 x float> %1 +} + +define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) { +; CHECK: test_vfmsq_n_f32: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 + %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 + %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2 + %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3 + %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a) + ret <4 x float> %1 +} diff --git a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/neon-2velem.ll new file mode 100644 index 0000000000000..9d6184243713d --- /dev/null +++ b/test/CodeGen/AArch64/neon-2velem.ll @@ -0,0 +1,2550 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>) + +declare <4 x float> 
@llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>) + +declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>) + +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) + +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) + +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) + +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) + +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmla_lane_s16: +; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %mul = mul <4 x i16> %shuffle, %b + %add = add <4 x i16> %mul, %a + ret <4 x i16> %add +} + +define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlaq_lane_s16: +; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %mul = mul <8 x i16> %shuffle, %b + %add = add <8 x i16> %mul, %a + ret <8 x i16> %add +} + +define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmla_lane_s32: +; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %mul = mul <2 x i32> %shuffle, %b + %add = add <2 x i32> %mul, %a + ret <2 x i32> %add +} + +define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlaq_lane_s32: +; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %mul = mul <4 x i32> %shuffle, %b + %add = add <4 x i32> %mul, %a + ret <4 x i32> %add +} + +define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmla_laneq_s16: +; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %mul = mul <4 x i16> %shuffle, %b + %add = add <4 x i16> %mul, %a + ret <4 x i16> %add +} + +define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlaq_laneq_s16: +; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x 
i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + %mul = mul <8 x i16> %shuffle, %b + %add = add <8 x i16> %mul, %a + ret <8 x i16> %add +} + +define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmla_laneq_s32: +; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %mul = mul <2 x i32> %shuffle, %b + %add = add <2 x i32> %mul, %a + ret <2 x i32> %add +} + +define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlaq_laneq_s32: +; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %mul = mul <4 x i32> %shuffle, %b + %add = add <4 x i32> %mul, %a + ret <4 x i32> %add +} + +define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmls_lane_s16: +; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} + +define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsq_lane_s16: +; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %mul = mul <8 x i16> %shuffle, %b + %sub = sub <8 x i16> %a, %mul + ret <8 x i16> %sub +} + +define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmls_lane_s32: +; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %mul = mul <2 x i32> %shuffle, %b + %sub = sub <2 x i32> %a, %mul + ret <2 x i32> %sub +} + +define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsq_lane_s32: +; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %mul = mul <4 x i32> %shuffle, %b + %sub = sub <4 x i32> %a, %mul + ret <4 x i32> %sub +} + +define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmls_laneq_s16: +; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} + +define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsq_laneq_s16: +; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + %mul = mul <8 x i16> %shuffle, %b + %sub = sub <8 x i16> %a, %mul + ret <8 x i16> %sub +} + +define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmls_laneq_s32: +; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %mul = mul <2 x i32> %shuffle, %b + %sub = sub <2 x i32> %a, %mul + ret <2 
x i32> %sub +} + +define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsq_laneq_s32: +; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %mul = mul <4 x i32> %shuffle, %b + %sub = sub <4 x i32> %a, %mul + ret <4 x i32> %sub +} + +define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmul_lane_s16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmulq_lane_s16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmul_lane_s32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmulq_lane_s32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmul_lane_u16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmulq_lane_u16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmul_lane_u32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmulq_lane_u32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmul_laneq_s16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmulq_laneq_s16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] 
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_s32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_s32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_u16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_u16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_u32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_u32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfma_lane_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+
+define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmaq_lane_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfma_laneq_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { +; CHECK: test_vfmaq_laneq_f32: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { +; CHECK: test_vfms_lane_f32: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1> + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { +; CHECK: test_vfmsq_lane_f32: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { +; CHECK: test_vfms_laneq_f32: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { +; CHECK: test_vfmsq_laneq_f32: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { +; CHECK: test_vfmaq_lane_f64: +; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) + ret <2 x double> %0 +} + +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmaq_laneq_f64: +; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +entry: + %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) + ret <2 x double> %0 +} + +define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { +; CHECK: test_vfmsq_lane_f64: +; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %sub = fsub <1 x 
double> <double -0.000000e+00>, %v + %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) + ret <2 x double> %0 +} + +define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmsq_laneq_f64: +; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +entry: + %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v + %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1> + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) + ret <2 x double> %0 +} + +define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_lane_s16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_lane_s32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_laneq_s16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_laneq_s32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_high_lane_s16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_high_lane_s32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + 
ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_high_laneq_s16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_high_laneq_s32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_lane_s16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_lane_s32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_laneq_s16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_laneq_s32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_high_lane_s16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_high_lane_s32: +; 
CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_high_laneq_s16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_high_laneq_s32: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_lane_u16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_lane_u32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_laneq_u16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_laneq_u32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_high_lane_u16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = 
shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_high_lane_u32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_high_laneq_u16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_high_laneq_u32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_lane_u16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_lane_u32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_laneq_u16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_laneq_u32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> 
%shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_high_lane_u16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_high_lane_u32: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_high_laneq_u16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_high_laneq_u32: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_lane_s16: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_lane_s32: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_lane_u16: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_lane_u32: +; CHECK: mull 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_high_lane_s16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_high_lane_s32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_high_lane_u16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_high_lane_u32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_laneq_s16: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_laneq_s32: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_laneq_u16: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_laneq_u32: +; 
CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_high_laneq_s16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_high_laneq_s32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_high_laneq_u16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_high_laneq_u32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlal_lane_s16: +; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + ret <4 x i32> %vqdmlal4.i +} + +define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlal_lane_s32: +; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + ret <2 x i64> %vqdmlal4.i +} + +define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlal_high_lane_s16: +; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x 
i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + ret <4 x i32> %vqdmlal4.i +} + +define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlal_high_lane_s32: +; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + ret <2 x i64> %vqdmlal4.i +} + +define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlsl_lane_s16: +; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + ret <4 x i32> %vqdmlsl4.i +} + +define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlsl_lane_s32: +; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + ret <2 x i64> %vqdmlsl4.i +} + +define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlsl_high_lane_s16: +; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + ret <4 x i32> %vqdmlsl4.i +} + +define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlsl_high_lane_s32: +; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + ret <2 x i64> %vqdmlsl4.i +} + +define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmull_lane_s16: +; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 
3, i32 3, i32 3> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmull_lane_s32: +; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vqdmull_laneq_s16: +; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vqdmull_laneq_s32: +; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmull_high_lane_s16: +; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmull_high_lane_s32: +; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vqdmull_high_laneq_s16: +; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vqdmull_high_laneq_s32: +; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmulh_lane_s16: +; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, 
{{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i16> %vqdmulh2.i +} + +define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmulhq_lane_s16: +; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + ret <8 x i16> %vqdmulh2.i +} + +define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmulh_lane_s32: +; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i32> %vqdmulh2.i +} + +define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmulhq_lane_s32: +; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + ret <4 x i32> %vqdmulh2.i +} + +define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqrdmulh_lane_s16: +; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i16> %vqrdmulh2.i +} + +define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqrdmulhq_lane_s16: +; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + ret <8 x i16> %vqrdmulh2.i +} + +define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqrdmulh_lane_s32: +; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i32> %vqrdmulh2.i +} + +define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqrdmulhq_lane_s32: +; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + ret <4 x i32> %vqrdmulh2.i +} + +define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { +; CHECK: test_vmul_lane_f32: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> + %mul = fmul <2 x float> %shuffle, %a + ret <2 x float> %mul +} + +define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 
x double> %v) { +; CHECK: test_vmul_lane_f64: +; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +entry: + %0 = bitcast <1 x double> %a to <8 x i8> + %1 = bitcast <8 x i8> %0 to double + %extract = extractelement <1 x double> %v, i32 0 + %2 = fmul double %1, %extract + %3 = insertelement <1 x double> undef, double %2, i32 0 + ret <1 x double> %3 +} + +define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { +; CHECK: test_vmulq_lane_f32: +; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %mul = fmul <4 x float> %shuffle, %a + ret <4 x float> %mul +} + +define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { +; CHECK: test_vmulq_lane_f64: +; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %a + ret <2 x double> %mul +} + +define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { +; CHECK: test_vmul_laneq_f32: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %mul = fmul <2 x float> %shuffle, %a + ret <2 x float> %mul +} + +define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) { +; CHECK: test_vmul_laneq_f64: +; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] +entry: + %0 = bitcast <1 x double> %a to <8 x i8> + %1 = bitcast <8 x i8> %0 to double + %extract = extractelement <2 x double> %v, i32 1 + %2 = fmul double %1, %extract + %3 = insertelement <1 x double> undef, double %2, i32 0 + ret <1 x double> %3 +} + +define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { +; CHECK: test_vmulq_laneq_f32: +; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %mul = fmul <4 x float> %shuffle, %a + ret <4 x float> %mul +} + +define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulq_laneq_f64: +; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> + %mul = fmul <2 x double> %shuffle, %a + ret <2 x double> %mul +} + +define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { +; CHECK: test_vmulx_lane_f32: +; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + ret <2 x float> %vmulx2.i +} + +define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { +; CHECK: test_vmulxq_lane_f32: +; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + ret <4 x float> %vmulx2.i +} + +define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { +; CHECK: test_vmulxq_lane_f64: +; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %shuffle = shufflevector <1 x double> %v, <1 x double> 
undef, <2 x i32> zeroinitializer + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + ret <2 x double> %vmulx2.i +} + +define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { +; CHECK: test_vmulx_laneq_f32: +; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + ret <2 x float> %vmulx2.i +} + +define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { +; CHECK: test_vmulxq_laneq_f32: +; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + ret <4 x float> %vmulx2.i +} + +define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulxq_laneq_f64: +; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + ret <2 x double> %vmulx2.i +} + +define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmla_lane_s16_0: +; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %b + %add = add <4 x i16> %mul, %a + ret <4 x i16> %add +} + +define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlaq_lane_s16_0: +; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> %shuffle, %b + %add = add <8 x i16> %mul, %a + ret <8 x i16> %add +} + +define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmla_lane_s32_0: +; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %b + %add = add <2 x i32> %mul, %a + ret <2 x i32> %add +} + +define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlaq_lane_s32_0: +; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %b + %add = add <4 x i32> %mul, %a + ret <4 x i32> %add +} + +define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmla_laneq_s16_0: +; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %b + %add = add <4 x i16> %mul, %a + ret <4 x i16> %add +} + +define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlaq_laneq_s16_0: +; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> 
%shuffle, %b + %add = add <8 x i16> %mul, %a + ret <8 x i16> %add +} + +define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmla_laneq_s32_0: +; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %b + %add = add <2 x i32> %mul, %a + ret <2 x i32> %add +} + +define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlaq_laneq_s32_0: +; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %b + %add = add <4 x i32> %mul, %a + ret <4 x i32> %add +} + +define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmls_lane_s16_0: +; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} + +define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsq_lane_s16_0: +; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> %shuffle, %b + %sub = sub <8 x i16> %a, %mul + ret <8 x i16> %sub +} + +define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmls_lane_s32_0: +; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %b + %sub = sub <2 x i32> %a, %mul + ret <2 x i32> %sub +} + +define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsq_lane_s32_0: +; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %b + %sub = sub <4 x i32> %a, %mul + ret <4 x i32> %sub +} + +define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmls_laneq_s16_0: +; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} + +define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsq_laneq_s16_0: +; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> %shuffle, %b + %sub = sub <8 x i16> %a, %mul + ret <8 x i16> %sub +} + +define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmls_laneq_s32_0: +; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %b + %sub = sub <2 x i32> %a, %mul + ret <2 x i32> %sub +} + +define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsq_laneq_s32_0: +; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, 
{{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %b + %sub = sub <4 x i32> %a, %mul + ret <4 x i32> %sub +} + +define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmul_lane_s16_0: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmulq_lane_s16_0: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmul_lane_s32_0: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmulq_lane_s32_0: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmul_lane_u16_0: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmulq_lane_u16_0: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmul_lane_u32_0: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmulq_lane_u32_0: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmul_laneq_s16_0: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmulq_laneq_s16_0: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmul_laneq_s32_0: +; CHECK: mul {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmulq_laneq_s32_0: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmul_laneq_u16_0: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmulq_laneq_u16_0: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmul_laneq_u32_0: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmulq_laneq_u32_0: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { +; CHECK: test_vfma_lane_f32_0: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { +; CHECK: test_vfmaq_lane_f32_0: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { +; CHECK: test_vfma_laneq_f32_0: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { +; CHECK: test_vfmaq_laneq_f32_0: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { +; CHECK: test_vfms_lane_f32_0: +; CHECK: 
fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { +; CHECK: test_vfmsq_lane_f32_0: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { +; CHECK: test_vfms_laneq_f32_0: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) + ret <2 x float> %0 +} + +define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { +; CHECK: test_vfmsq_laneq_f32_0: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v + %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer + %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) + ret <4 x float> %0 +} + +define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmaq_laneq_f64_0: +; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) + ret <2 x double> %0 +} + +define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmsq_laneq_f64_0: +; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v + %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer + %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) + ret <2 x double> %0 +} + +define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_lane_s16_0: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_lane_s32_0: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> 
@llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_laneq_s16_0: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_laneq_s32_0: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_high_lane_s16_0: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_high_lane_s32_0: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_high_laneq_s16_0: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_high_laneq_s32_0: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_lane_s16_0: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = 
sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_lane_s32_0: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_laneq_s16_0: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_laneq_s32_0: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_high_lane_s16_0: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_high_lane_s32_0: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_high_laneq_s16_0: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_high_laneq_s32_0: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> 
@test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_lane_u16_0: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_lane_u32_0: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_laneq_u16_0: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_laneq_u32_0: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_high_lane_u16_0: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_high_lane_u32_0: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_high_laneq_u16_0: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_high_laneq_u32_0: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] 
+entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_lane_u16_0: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_lane_u32_0: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_laneq_u16_0: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_laneq_u32_0: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_high_lane_u16_0: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_high_lane_u32_0: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_high_laneq_u16_0: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer 
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_high_laneq_u32_0: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_lane_s16_0: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_lane_s32_0: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_lane_u16_0: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_lane_u32_0: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_high_lane_s16_0: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_high_lane_s32_0: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_high_lane_u16_0: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = 
shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_high_lane_u32_0: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_laneq_s16_0: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_laneq_s32_0: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_laneq_u16_0: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_laneq_u32_0: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_high_laneq_s16_0: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_high_laneq_s32_0: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_high_laneq_u16_0: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = 
shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_high_laneq_u32_0: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlal_lane_s16_0: +; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + ret <4 x i32> %vqdmlal4.i +} + +define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlal_lane_s32_0: +; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + ret <2 x i64> %vqdmlal4.i +} + +define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlal_high_lane_s16_0: +; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + ret <4 x i32> %vqdmlal4.i +} + +define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlal_high_lane_s32_0: +; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + ret <2 x i64> %vqdmlal4.i +} + +define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlsl_lane_s16_0: +; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + ret <4 x i32> %vqdmlsl4.i +} + +define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> 
%b, <2 x i32> %v) { +; CHECK: test_vqdmlsl_lane_s32_0: +; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + ret <2 x i64> %vqdmlsl4.i +} + +define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlsl_high_lane_s16_0: +; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + ret <4 x i32> %vqdmlsl4.i +} + +define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlsl_high_lane_s32_0: +; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + ret <2 x i64> %vqdmlsl4.i +} + +define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmull_lane_s16_0: +; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmull_lane_s32_0: +; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vqdmull_laneq_s16_0: +; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vqdmull_laneq_s32_0: +; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmull_high_lane_s16_0: +; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> 
<i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmull_high_lane_s32_0: +; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vqdmull_high_laneq_s16_0: +; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vqdmull_high_laneq_s32_0: +; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmulh_lane_s16_0: +; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i16> %vqdmulh2.i +} + +define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmulhq_lane_s16_0: +; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer + %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + ret <8 x i16> %vqdmulh2.i +} + +define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmulh_lane_s32_0: +; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i32> %vqdmulh2.i +} + +define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmulhq_lane_s32_0: +; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + ret <4 x i32> %vqdmulh2.i +} + +define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqrdmulh_lane_s16_0: +; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] 
+entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer + %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) + ret <4 x i16> %vqrdmulh2.i +} + +define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqrdmulhq_lane_s16_0: +; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer + %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) + ret <8 x i16> %vqrdmulh2.i +} + +define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqrdmulh_lane_s32_0: +; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) + ret <2 x i32> %vqrdmulh2.i +} + +define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqrdmulhq_lane_s32_0: +; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) + ret <4 x i32> %vqrdmulh2.i +} + +define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) { +; CHECK: test_vmul_lane_f32_0: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x float> %shuffle, %a + ret <2 x float> %mul +} + +define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) { +; CHECK: test_vmulq_lane_f32_0: +; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer + %mul = fmul <4 x float> %shuffle, %a + ret <4 x float> %mul +} + +define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) { +; CHECK: test_vmul_laneq_f32_0: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x float> %shuffle, %a + ret <2 x float> %mul +} + +define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) { +; CHECK: test_vmul_laneq_f64_0: +; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] +entry: + %0 = bitcast <1 x double> %a to <8 x i8> + %1 = bitcast <8 x i8> %0 to double + %extract = extractelement <2 x double> %v, i32 0 + %2 = fmul double %1, %extract + %3 = insertelement <1 x double> undef, double %2, i32 0 + ret <1 x double> %3 +} + +define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { +; CHECK: test_vmulq_laneq_f32_0: +; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer + %mul = fmul <4 x float> %shuffle, %a + ret <4 x float> %mul +} + +define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulq_laneq_f64_0: +; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %a + ret <2 x double> 
%mul +} + +define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) { +; CHECK: test_vmulx_lane_f32_0: +; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + ret <2 x float> %vmulx2.i +} + +define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) { +; CHECK: test_vmulxq_lane_f32_0: +; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + ret <4 x float> %vmulx2.i +} + +define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) { +; CHECK: test_vmulxq_lane_f64_0: +; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + ret <2 x double> %vmulx2.i +} + +define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { +; CHECK: test_vmulx_laneq_f32_0: +; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) + ret <2 x float> %vmulx2.i +} + +define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { +; CHECK: test_vmulxq_laneq_f32_0: +; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) + ret <4 x float> %vmulx2.i +} + +define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulxq_laneq_f64_0: +; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) + ret <2 x double> %vmulx2.i +} + diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/neon-3vdiff.ll new file mode 100644 index 0000000000000..171e2b2edad06 --- /dev/null +++ b/test/CodeGen/AArch64/neon-3vdiff.ll @@ -0,0 +1,1806 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) + +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) + +declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) + +declare <2 x i64> 
@llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) + +declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) + +declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) + +declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) + +declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) + +declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) + +declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) + +declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) + +declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) + +declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) + +declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) + +declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) + +declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) + +declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vaddl_s8: +; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = sext <8 x i8> %a to <8 x i16> + %vmovl.i2.i = sext <8 x i8> %b to <8 x i16> + %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vaddl_s16: +; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = sext <4 x i16> %a to <4 x i32> + %vmovl.i2.i = sext <4 x i16> %b to <4 x i32> + %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vaddl_s32: +; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = sext <2 x i32> %a to <2 x i64> + %vmovl.i2.i = sext <2 x i32> %b to <2 x i64> + %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vaddl_u8: +; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = zext <8 x i8> %a to <8 x i16> + %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> + %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vaddl_u16: +; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = zext <4 x i16> %a to <4 x i32> + %vmovl.i2.i = zext <4 x i16> %b to <4 x i32> + %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vaddl_u32: +; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = zext <2 x i32> %a to <2 x i64> + %vmovl.i2.i = zext <2 x i32> %b to <2 x i64> + %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vaddl_high_s8: +; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> + %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16> + 
%add.i = add <8 x i16> %0, %1 + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vaddl_high_s16: +; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> + %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32> + %add.i = add <4 x i32> %0, %1 + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vaddl_high_s32: +; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> + %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64> + %add.i = add <2 x i64> %0, %1 + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vaddl_high_u8: +; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> + %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16> + %add.i = add <8 x i16> %0, %1 + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vaddl_high_u16: +; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> + %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32> + %add.i = add <4 x i32> %0, %1 + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vaddl_high_u32: +; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> + %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64> + %add.i = add <2 x i64> %0, %1 + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) { +; CHECK: test_vaddw_s8: +; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = sext <8 x i8> %b to <8 x i16> + %add.i = add <8 x i16> %vmovl.i.i, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) { +; CHECK: test_vaddw_s16: +; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = sext <4 x i16> %b to <4 x i32> + %add.i = add <4 x i32> %vmovl.i.i, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) { +; CHECK: test_vaddw_s32: +; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = sext <2 x i32> %b to <2 x i64> + %add.i = add <2 x i64> 
%vmovl.i.i, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) { +; CHECK: test_vaddw_u8: +; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = zext <8 x i8> %b to <8 x i16> + %add.i = add <8 x i16> %vmovl.i.i, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) { +; CHECK: test_vaddw_u16: +; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = zext <4 x i16> %b to <4 x i32> + %add.i = add <4 x i32> %vmovl.i.i, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) { +; CHECK: test_vaddw_u32: +; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = zext <2 x i32> %b to <2 x i64> + %add.i = add <2 x i64> %vmovl.i.i, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) { +; CHECK: test_vaddw_high_s8: +; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> + %add.i = add <8 x i16> %0, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) { +; CHECK: test_vaddw_high_s16: +; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> + %add.i = add <4 x i32> %0, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) { +; CHECK: test_vaddw_high_s32: +; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> + %add.i = add <2 x i64> %0, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) { +; CHECK: test_vaddw_high_u8: +; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> + %add.i = add <8 x i16> %0, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) { +; CHECK: test_vaddw_high_u16: +; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> + %add.i = add <4 x i32> %0, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) { +; CHECK: test_vaddw_high_u32: +; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> + %add.i = add <2 x i64> %0, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsubl_s8: +; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = sext <8 x i8> %a to <8 x i16> + %vmovl.i2.i = sext <8 x i8> %b to <8 x i16> + %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i + ret <8 x i16> %sub.i +} + +define <4 x i32> 
@test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsubl_s16: +; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = sext <4 x i16> %a to <4 x i32> + %vmovl.i2.i = sext <4 x i16> %b to <4 x i32> + %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsubl_s32: +; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = sext <2 x i32> %a to <2 x i64> + %vmovl.i2.i = sext <2 x i32> %b to <2 x i64> + %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsubl_u8: +; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = zext <8 x i8> %a to <8 x i16> + %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> + %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsubl_u16: +; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = zext <4 x i16> %a to <4 x i32> + %vmovl.i2.i = zext <4 x i16> %b to <4 x i32> + %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsubl_u32: +; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = zext <2 x i32> %a to <2 x i64> + %vmovl.i2.i = zext <2 x i32> %b to <2 x i64> + %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsubl_high_s8: +; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> + %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16> + %sub.i = sub <8 x i16> %0, %1 + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsubl_high_s16: +; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> + %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32> + %sub.i = sub <4 x i32> %0, %1 + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsubl_high_s32: +; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> + %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64> + %sub.i = sub <2 x i64> %0, %1 + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsubl_high_u8: +; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 
10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> + %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16> + %sub.i = sub <8 x i16> %0, %1 + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsubl_high_u16: +; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> + %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32> + %sub.i = sub <4 x i32> %0, %1 + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsubl_high_u32: +; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> + %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64> + %sub.i = sub <2 x i64> %0, %1 + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) { +; CHECK: test_vsubw_s8: +; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = sext <8 x i8> %b to <8 x i16> + %sub.i = sub <8 x i16> %a, %vmovl.i.i + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) { +; CHECK: test_vsubw_s16: +; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = sext <4 x i16> %b to <4 x i32> + %sub.i = sub <4 x i32> %a, %vmovl.i.i + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) { +; CHECK: test_vsubw_s32: +; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = sext <2 x i32> %b to <2 x i64> + %sub.i = sub <2 x i64> %a, %vmovl.i.i + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) { +; CHECK: test_vsubw_u8: +; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b +entry: + %vmovl.i.i = zext <8 x i8> %b to <8 x i16> + %sub.i = sub <8 x i16> %a, %vmovl.i.i + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) { +; CHECK: test_vsubw_u16: +; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h +entry: + %vmovl.i.i = zext <4 x i16> %b to <4 x i32> + %sub.i = sub <4 x i32> %a, %vmovl.i.i + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) { +; CHECK: test_vsubw_u32: +; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s +entry: + %vmovl.i.i = zext <2 x i32> %b to <2 x i64> + %sub.i = sub <2 x i64> %a, %vmovl.i.i + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) { +; CHECK: test_vsubw_high_s8: +; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16> + %sub.i = sub <8 x i16> %a, %0 + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) { +; CHECK: test_vsubw_high_s16: +; 
CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32> + %sub.i = sub <4 x i32> %a, %0 + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) { +; CHECK: test_vsubw_high_s32: +; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64> + %sub.i = sub <2 x i64> %a, %0 + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) { +; CHECK: test_vsubw_high_u8: +; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b +entry: + %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> + %sub.i = sub <8 x i16> %a, %0 + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) { +; CHECK: test_vsubw_high_u16: +; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h +entry: + %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32> + %sub.i = sub <4 x i32> %a, %0 + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) { +; CHECK: test_vsubw_high_u32: +; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s +entry: + %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64> + %sub.i = sub <2 x i64> %a, %0 + ret <2 x i64> %sub.i +} + +define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vaddhn_s16: +; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vaddhn.i = add <8 x i16> %a, %b + %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8> + ret <8 x i8> %vaddhn2.i +} + +define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vaddhn_s32: +; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vaddhn.i = add <4 x i32> %a, %b + %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16> + %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16> + ret <4 x i16> %vaddhn2.i +} + +define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vaddhn_s64: +; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vaddhn.i = add <2 x i64> %a, %b + %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> + %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32> + ret <2 x i32> %vaddhn2.i +} + +define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vaddhn_u16: +; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vaddhn.i = add <8 x i16> %a, %b + %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8> + ret <8 x i8> %vaddhn2.i +} + +define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vaddhn_u32: +; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vaddhn.i = add <4 x i32> %a, %b + %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, 
i32 16> + %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16> + ret <4 x i16> %vaddhn2.i +} + +define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vaddhn_u64: +; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vaddhn.i = add <2 x i64> %a, %b + %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> + %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32> + ret <2 x i32> %vaddhn2.i +} + +define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vaddhn_high_s16: +; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vaddhn.i.i = add <8 x i16> %a, %b + %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8> + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vaddhn_high_s32: +; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vaddhn.i.i = add <4 x i32> %a, %b + %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> + %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16> + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} + +define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vaddhn_high_s64: +; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vaddhn.i.i = add <2 x i64> %a, %b + %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> + %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32> + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vaddhn_high_u16: +; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vaddhn.i.i = add <8 x i16> %a, %b + %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8> + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vaddhn_high_u32: +; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vaddhn.i.i = add <4 x i32> %a, %b + %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> + %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16> + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} + +define <4 x i32> 
@test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vaddhn_high_u64: +; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vaddhn.i.i = add <2 x i64> %a, %b + %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> + %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32> + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vraddhn_s16: +; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + ret <8 x i8> %vraddhn2.i +} + +define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vraddhn_s32: +; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + ret <4 x i16> %vraddhn2.i +} + +define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vraddhn_s64: +; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + ret <2 x i32> %vraddhn2.i +} + +define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vraddhn_u16: +; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + ret <8 x i8> %vraddhn2.i +} + +define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vraddhn_u32: +; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + ret <4 x i16> %vraddhn2.i +} + +define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vraddhn_u64: +; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + ret <2 x i32> %vraddhn2.i +} + +define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vraddhn_high_s16: +; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vraddhn_high_s32: +; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} + +define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vraddhn_high_s64: +; CHECK: raddhn2 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vraddhn_high_u16: +; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vraddhn_high_u32: +; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} + +define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vraddhn_high_u64: +; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsubhn_s16: +; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vsubhn.i = sub <8 x i16> %a, %b + %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8> + ret <8 x i8> %vsubhn2.i +} + +define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsubhn_s32: +; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vsubhn.i = sub <4 x i32> %a, %b + %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> + %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16> + ret <4 x i16> %vsubhn2.i +} + +define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsubhn_s64: +; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vsubhn.i = sub <2 x i64> %a, %b + %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> + %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32> + ret <2 x i32> %vsubhn2.i +} + +define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsubhn_u16: +; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vsubhn.i = sub <8 x i16> %a, %b + %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8> + ret <8 x i8> %vsubhn2.i +} + +define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsubhn_u32: +; 
CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vsubhn.i = sub <4 x i32> %a, %b + %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> + %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16> + ret <4 x i16> %vsubhn2.i +} + +define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsubhn_u64: +; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vsubhn.i = sub <2 x i64> %a, %b + %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> + %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32> + ret <2 x i32> %vsubhn2.i +} + +define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsubhn_high_s16: +; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vsubhn.i.i = sub <8 x i16> %a, %b + %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8> + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsubhn_high_s32: +; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vsubhn.i.i = sub <4 x i32> %a, %b + %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> + %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16> + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} + +define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsubhn_high_s64: +; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vsubhn.i.i = sub <2 x i64> %a, %b + %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> + %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32> + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsubhn_high_u16: +; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vsubhn.i.i = sub <8 x i16> %a, %b + %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8> + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsubhn_high_u32: +; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vsubhn.i.i = sub <4 x i32> %a, %b + %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> + %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16> + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64> + %shuffle.i.i 
= shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} + +define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsubhn_high_u64: +; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vsubhn.i.i = sub <2 x i64> %a, %b + %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> + %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32> + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vrsubhn_s16: +; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + ret <8 x i8> %vrsubhn2.i +} + +define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vrsubhn_s32: +; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + ret <4 x i16> %vrsubhn2.i +} + +define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vrsubhn_s64: +; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + ret <2 x i32> %vrsubhn2.i +} + +define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vrsubhn_u16: +; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + ret <8 x i8> %vrsubhn2.i +} + +define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vrsubhn_u32: +; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + ret <4 x i16> %vrsubhn2.i +} + +define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vrsubhn_u64: +; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + ret <2 x i32> %vrsubhn2.i +} + +define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vrsubhn_high_s16: +; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vrsubhn_high_s32: +; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} 
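Editorial note (illustrative sketch, not part of the patch): the *_high tests above all rely on the same concatenation idiom. The low half of the result arrives as %r, the narrowed value is computed, and the two 64-bit halves are glued together with <1 x i64> bitcasts plus a shufflevector, which the backend is expected to select to the upper-half ("2") form of the instruction. A minimal standalone module exercising that idiom might look as follows; the function name sketch_rsubhn2 is made up for illustration, while the RUN line, intrinsic signature, and CHECK pattern mirror those used elsewhere in this patch.

; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s

declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>)

define <16 x i8> @sketch_rsubhn2(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK: sketch_rsubhn2:
; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  ; rounded, narrowing (high-half) difference of the two q-register inputs
  %narrow = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ; concatenate the existing low half %r with the freshly narrowed high half
  %lo = bitcast <8 x i8> %r to <1 x i64>
  %hi = bitcast <8 x i8> %narrow to <1 x i64>
  %pair = shufflevector <1 x i64> %lo, <1 x i64> %hi, <2 x i32> <i32 0, i32 1>
  %res = bitcast <2 x i64> %pair to <16 x i8>
  ret <16 x i8> %res
}

The same skeleton, with the intrinsic call swapped for the appropriate add/sub/lshr/trunc sequence or *hn intrinsic, covers the addhn2, raddhn2, and subhn2 cases checked above.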
+ +define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vrsubhn_high_s64: +; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vrsubhn_high_u16: +; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) + %0 = bitcast <8 x i8> %r to <1 x i64> + %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vrsubhn_high_u32: +; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) + %0 = bitcast <4 x i16> %r to <1 x i64> + %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16> + ret <8 x i16> %2 +} + +define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vrsubhn_high_u64: +; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +entry: + %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) + %0 = bitcast <2 x i32> %r to <1 x i64> + %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64> + %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1> + %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vabdl_s8: +; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) + %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16> + ret <8 x i16> %vmovl.i.i +} + +define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vabdl_s16: +; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) + %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32> + ret <4 x i32> %vmovl.i.i +} + +define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vabdl_s32: +; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) + %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64> + ret <2 x i64> %vmovl.i.i +} + +define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vabdl_u8: +; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) + %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16> + ret <8 x i16> %vmovl.i.i +} + +define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: 
test_vabdl_u16: +; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) + %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32> + ret <4 x i32> %vmovl.i.i +} + +define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vabdl_u32: +; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) + %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64> + ret <2 x i64> %vmovl.i.i +} + +define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vabal_s8: +; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) + %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> + %add.i = add <8 x i16> %vmovl.i.i.i, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vabal_s16: +; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) + %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> + %add.i = add <4 x i32> %vmovl.i.i.i, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vabal_s32: +; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) + %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> + %add.i = add <2 x i64> %vmovl.i.i.i, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vabal_u8: +; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) + %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> + %add.i = add <8 x i16> %vmovl.i.i.i, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vabal_u16: +; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) + %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> + %add.i = add <4 x i32> %vmovl.i.i.i, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vabal_u32: +; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) + %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> + %add.i = add <2 x i64> %vmovl.i.i.i, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vabdl_high_s8: +; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> 
+ ret <8 x i16> %vmovl.i.i.i +} + +define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vabdl_high_s16: +; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> + ret <4 x i32> %vmovl.i.i.i +} + +define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vabdl_high_s32: +; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> + ret <2 x i64> %vmovl.i.i.i +} + +define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vabdl_high_u8: +; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16> + ret <8 x i16> %vmovl.i.i.i +} + +define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vabdl_high_u16: +; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32> + ret <4 x i32> %vmovl.i.i.i +} + +define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vabdl_high_u32: +; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64> + ret <2 x i64> %vmovl.i.i.i +} + +define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vabal_high_s8: +; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i 
to <8 x i16> + %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a + ret <8 x i16> %add.i.i +} + +define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vabal_high_s16: +; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32> + %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vabal_high_s32: +; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64> + %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vabal_high_u8: +; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16> + %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a + ret <8 x i16> %add.i.i +} + +define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vabal_high_u16: +; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32> + %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vabal_high_u32: +; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64> + %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vmull_s8: +; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) + ret 
<8 x i16> %vmull.i +} + +define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vmull_s16: +; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vmull_s32: +; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b) + ret <2 x i64> %vmull2.i +} + +define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vmull_u8: +; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) + ret <8 x i16> %vmull.i +} + +define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vmull_u16: +; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b) + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vmull_u32: +; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b) + ret <2 x i64> %vmull2.i +} + +define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vmull_high_s8: +; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + ret <8 x i16> %vmull.i.i +} + +define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vmull_high_s16: +; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + ret <4 x i32> %vmull2.i.i +} + +define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vmull_high_s32: +; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + ret <2 x i64> %vmull2.i.i +} + +define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vmull_high_u8: +; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, 
<8 x i8> %shuffle.i3.i) + ret <8 x i16> %vmull.i.i +} + +define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vmull_high_u16: +; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + ret <4 x i32> %vmull2.i.i +} + +define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vmull_high_u32: +; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + ret <2 x i64> %vmull2.i.i +} + +define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vmlal_s8: +; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) + %add.i = add <8 x i16> %vmull.i.i, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vmlal_s16: +; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) + %add.i = add <4 x i32> %vmull2.i.i, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vmlal_s32: +; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) + %add.i = add <2 x i64> %vmull2.i.i, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vmlal_u8: +; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) + %add.i = add <8 x i16> %vmull.i.i, %a + ret <8 x i16> %add.i +} + +define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vmlal_u16: +; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) + %add.i = add <4 x i32> %vmull2.i.i, %a + ret <4 x i32> %add.i +} + +define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vmlal_u32: +; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) + %add.i = add <2 x i64> %vmull2.i.i, %a + ret <2 x i64> %add.i +} + +define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vmlal_high_s8: +; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vmull.i.i.i = tail call 
<8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %add.i.i = add <8 x i16> %vmull.i.i.i, %a + ret <8 x i16> %add.i.i +} + +define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vmlal_high_s16: +; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %add.i.i = add <4 x i32> %vmull2.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vmlal_high_s32: +; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %add.i.i = add <2 x i64> %vmull2.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vmlal_high_u8: +; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %add.i.i = add <8 x i16> %vmull.i.i.i, %a + ret <8 x i16> %add.i.i +} + +define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vmlal_high_u16: +; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %add.i.i = add <4 x i32> %vmull2.i.i.i, %a + ret <4 x i32> %add.i.i +} + +define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vmlal_high_u32: +; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %add.i.i = add <2 x i64> %vmull2.i.i.i, %a + ret <2 x i64> %add.i.i +} + +define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vmlsl_s8: +; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) + %sub.i = sub <8 x i16> %a, %vmull.i.i + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vmlsl_s16: +; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, 
{{v[0-9]+}}.4h +entry: + %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) + %sub.i = sub <4 x i32> %a, %vmull2.i.i + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vmlsl_s32: +; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) + %sub.i = sub <2 x i64> %a, %vmull2.i.i + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vmlsl_u8: +; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) + %sub.i = sub <8 x i16> %a, %vmull.i.i + ret <8 x i16> %sub.i +} + +define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vmlsl_u16: +; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) + %sub.i = sub <4 x i32> %a, %vmull2.i.i + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vmlsl_u32: +; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) + %sub.i = sub <2 x i64> %a, %vmull2.i.i + ret <2 x i64> %sub.i +} + +define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vmlsl_high_s8: +; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i + ret <8 x i16> %sub.i.i +} + +define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vmlsl_high_s16: +; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i + ret <4 x i32> %sub.i.i +} + +define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vmlsl_high_s32: +; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i + ret <2 x i64> %sub.i.i +} + +define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vmlsl_high_u8: +; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> 
<i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i + ret <8 x i16> %sub.i.i +} + +define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vmlsl_high_u16: +; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i + ret <4 x i32> %sub.i.i +} + +define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vmlsl_high_u32: +; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i + ret <2 x i64> %sub.i.i +} + +define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vqdmull_s16: +; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vqdmull_s32: +; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vqdmlal_s16: +; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) + ret <4 x i32> %vqdmlal4.i +} + +define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vqdmlal_s32: +; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) + ret <2 x i64> %vqdmlal4.i +} + +define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) { +; CHECK: test_vqdmlsl_s16: +; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) + %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) + ret <4 x i32> %vqdmlsl4.i +} + +define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK: test_vqdmlsl_s32: +; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %vqdmlsl2.i = tail call <2 x i64> 
@llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) + %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) + ret <2 x i64> %vqdmlsl4.i +} + +define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vqdmull_high_s16: +; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vqdmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + ret <4 x i32> %vqdmull2.i.i +} + +define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vqdmull_high_s32: +; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vqdmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + ret <2 x i64> %vqdmull2.i.i +} + +define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vqdmlal_high_s16: +; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i) + ret <4 x i32> %vqdmlal4.i.i +} + +define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vqdmlal_high_s32: +; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i) + ret <2 x i64> %vqdmlal4.i.i +} + +define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) { +; CHECK: test_vqdmlsl_high_s16: +; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i) + %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i) + ret <4 x i32> %vqdmlsl4.i.i +} + +define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK: test_vqdmlsl_high_s32: +; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vqdmlsl2.i.i = tail call <2 x 
i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i) + %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i) + ret <2 x i64> %vqdmlsl4.i.i +} + +define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vmull_p8: +; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) + ret <8 x i16> %vmull.i +} + +define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vmull_high_p8: +; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i) + ret <8 x i16> %vmull.i.i +} + diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/neon-aba-abd.ll new file mode 100644 index 0000000000000..54009849ef601 --- /dev/null +++ b/test/CodeGen/AArch64/neon-aba-abd.ll @@ -0,0 +1,236 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uabd_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uabd v0.8b, v0.8b, v1.8b + ret <8 x i8> %abd +} + +define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uaba_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %aba = add <8 x i8> %lhs, %abd +; CHECK: uaba v0.8b, v0.8b, v1.8b + ret <8 x i8> %aba +} + +define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sabd_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sabd v0.8b, v0.8b, v1.8b + ret <8 x i8> %abd +} + +define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_saba_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %aba = add <8 x i8> %lhs, %abd +; CHECK: saba v0.8b, v0.8b, v1.8b + ret <8 x i8> %aba +} + +declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uabd_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uabd v0.16b, v0.16b, v1.16b + ret <16 x i8> %abd +} + +define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uaba_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %aba = add <16 x i8> %lhs, %abd +; CHECK: uaba v0.16b, v0.16b, v1.16b + ret <16 x i8> %aba +} + +define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sabd_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sabd v0.16b, v0.16b, v1.16b + ret <16 x i8> %abd +} + +define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_saba_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> 
%lhs, <16 x i8> %rhs) + %aba = add <16 x i8> %lhs, %abd +; CHECK: saba v0.16b, v0.16b, v1.16b + ret <16 x i8> %aba +} + +declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uabd_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uabd v0.4h, v0.4h, v1.4h + ret <4 x i16> %abd +} + +define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uaba_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %aba = add <4 x i16> %lhs, %abd +; CHECK: uaba v0.4h, v0.4h, v1.4h + ret <4 x i16> %aba +} + +define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sabd_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sabd v0.4h, v0.4h, v1.4h + ret <4 x i16> %abd +} + +define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_saba_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %aba = add <4 x i16> %lhs, %abd +; CHECK: saba v0.4h, v0.4h, v1.4h + ret <4 x i16> %aba +} + +declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uabd_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uabd v0.8h, v0.8h, v1.8h + ret <8 x i16> %abd +} + +define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uaba_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %aba = add <8 x i16> %lhs, %abd +; CHECK: uaba v0.8h, v0.8h, v1.8h + ret <8 x i16> %aba +} + +define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sabd_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sabd v0.8h, v0.8h, v1.8h + ret <8 x i16> %abd +} + +define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_saba_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %aba = add <8 x i16> %lhs, %abd +; CHECK: saba v0.8h, v0.8h, v1.8h + ret <8 x i16> %aba +} + +declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uabd_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uabd v0.2s, v0.2s, v1.2s + ret <2 x i32> %abd +} + +define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uaba_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %aba = add <2 x i32> %lhs, %abd +; CHECK: uaba v0.2s, v0.2s, v1.2s + ret <2 x i32> %aba +} + +define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sabd_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sabd v0.2s, v0.2s, v1.2s + ret <2 x i32> %abd +} + +define <2 x i32> @test_sabd_v2i32_const() { +; CHECK: test_sabd_v2i32_const: +; CHECK: movi d1, #0xffffffff0000 +; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s + %1 = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32( + 
<2 x i32> <i32 -2147483648, i32 2147450880>, + <2 x i32> <i32 -65536, i32 65535>) + ret <2 x i32> %1 +} + +define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_saba_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %aba = add <2 x i32> %lhs, %abd +; CHECK: saba v0.2s, v0.2s, v1.2s + ret <2 x i32> %aba +} + +declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uabd_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uabd v0.4s, v0.4s, v1.4s + ret <4 x i32> %abd +} + +define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uaba_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %aba = add <4 x i32> %lhs, %abd +; CHECK: uaba v0.4s, v0.4s, v1.4s + ret <4 x i32> %aba +} + +define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sabd_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sabd v0.4s, v0.4s, v1.4s + ret <4 x i32> %abd +} + +define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_saba_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %aba = add <4 x i32> %lhs, %abd +; CHECK: saba v0.4s, v0.4s, v1.4s + ret <4 x i32> %aba +} + +declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>) + +define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fabd_v2f32: + %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fabd v0.2s, v0.2s, v1.2s + ret <2 x float> %abd +} + +declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) + +define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fabd_v4f32: + %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fabd v0.4s, v0.4s, v1.4s + ret <4 x float> %abd +} + +declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>) + +define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fabd_v2f64: + %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fabd v0.2d, v0.2d, v1.2d + ret <2 x double> %abd +} diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll new file mode 100644 index 0000000000000..733db970cf33d --- /dev/null +++ b/test/CodeGen/AArch64/neon-across.ll @@ -0,0 +1,476 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float>) + +declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float>) + +declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float>) + +declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float>) + +declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>) + +declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8>) + +declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8>) + +declare <1 x i32> 
@llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32>) + +declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8>) + +declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32>) + +declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8>) + +declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8>) + +declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8>) + +declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32>) + +declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8>) + +declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32>) + +declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8>) + +declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8>) + +declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16>) + +declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8>) + +declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32>) + +declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16>) + +declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8>) + +declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32>) + +declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16>) + +declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8>) + +declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16>) + +declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8>) + +declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16>) + +declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8>) + +define i16 @test_vaddlv_s8(<8 x i8> %a) { +; CHECK: test_vaddlv_s8: +; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i16> %saddlv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vaddlv_s16(<4 x i16> %a) { +; CHECK: test_vaddlv_s16: +; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i32> %saddlv.i, i32 0 + ret i32 %0 +} + +define i16 @test_vaddlv_u8(<8 x i8> %a) { +; CHECK: test_vaddlv_u8: +; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i16> %uaddlv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vaddlv_u16(<4 x i16> %a) { +; CHECK: test_vaddlv_u16: +; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i32> %uaddlv.i, i32 0 + ret i32 %0 +} + +define i16 @test_vaddlvq_s8(<16 x i8> %a) { +; CHECK: test_vaddlvq_s8: +; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i16> %saddlv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vaddlvq_s16(<8 x i16> %a) { +; CHECK: test_vaddlvq_s16: +; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %saddlv.i = 
tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i32> %saddlv.i, i32 0 + ret i32 %0 +} + +define i64 @test_vaddlvq_s32(<4 x i32> %a) { +; CHECK: test_vaddlvq_s32: +; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %saddlv.i = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i64> %saddlv.i, i32 0 + ret i64 %0 +} + +define i16 @test_vaddlvq_u8(<16 x i8> %a) { +; CHECK: test_vaddlvq_u8: +; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i16> %uaddlv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vaddlvq_u16(<8 x i16> %a) { +; CHECK: test_vaddlvq_u16: +; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i32> %uaddlv.i, i32 0 + ret i32 %0 +} + +define i64 @test_vaddlvq_u32(<4 x i32> %a) { +; CHECK: test_vaddlvq_u32: +; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %uaddlv.i = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i64> %uaddlv.i, i32 0 + ret i64 %0 +} + +define i8 @test_vmaxv_s8(<8 x i8> %a) { +; CHECK: test_vmaxv_s8: +; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i8> %smaxv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vmaxv_s16(<4 x i16> %a) { +; CHECK: test_vmaxv_s16: +; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i16> %smaxv.i, i32 0 + ret i16 %0 +} + +define i8 @test_vmaxv_u8(<8 x i8> %a) { +; CHECK: test_vmaxv_u8: +; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i8> %umaxv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vmaxv_u16(<4 x i16> %a) { +; CHECK: test_vmaxv_u16: +; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i16> %umaxv.i, i32 0 + ret i16 %0 +} + +define i8 @test_vmaxvq_s8(<16 x i8> %a) { +; CHECK: test_vmaxvq_s8: +; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i8> %smaxv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vmaxvq_s16(<8 x i16> %a) { +; CHECK: test_vmaxvq_s16: +; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i16> %smaxv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vmaxvq_s32(<4 x i32> %a) { +; CHECK: test_vmaxvq_s32: +; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %smaxv.i = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i32> %smaxv.i, i32 0 + ret i32 %0 +} + +define i8 @test_vmaxvq_u8(<16 x i8> %a) { +; CHECK: test_vmaxvq_u8: +; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i8> %umaxv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vmaxvq_u16(<8 x i16> %a) { +; CHECK: test_vmaxvq_u16: +; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %umaxv.i = tail 
call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i16> %umaxv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vmaxvq_u32(<4 x i32> %a) { +; CHECK: test_vmaxvq_u32: +; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %umaxv.i = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i32> %umaxv.i, i32 0 + ret i32 %0 +} + +define i8 @test_vminv_s8(<8 x i8> %a) { +; CHECK: test_vminv_s8: +; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i8> %sminv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vminv_s16(<4 x i16> %a) { +; CHECK: test_vminv_s16: +; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i16> %sminv.i, i32 0 + ret i16 %0 +} + +define i8 @test_vminv_u8(<8 x i8> %a) { +; CHECK: test_vminv_u8: +; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i8> %uminv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vminv_u16(<4 x i16> %a) { +; CHECK: test_vminv_u16: +; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i16> %uminv.i, i32 0 + ret i16 %0 +} + +define i8 @test_vminvq_s8(<16 x i8> %a) { +; CHECK: test_vminvq_s8: +; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i8> %sminv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vminvq_s16(<8 x i16> %a) { +; CHECK: test_vminvq_s16: +; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i16> %sminv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vminvq_s32(<4 x i32> %a) { +; CHECK: test_vminvq_s32: +; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %sminv.i = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i32> %sminv.i, i32 0 + ret i32 %0 +} + +define i8 @test_vminvq_u8(<16 x i8> %a) { +; CHECK: test_vminvq_u8: +; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i8> %uminv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vminvq_u16(<8 x i16> %a) { +; CHECK: test_vminvq_u16: +; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i16> %uminv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vminvq_u32(<4 x i32> %a) { +; CHECK: test_vminvq_u32: +; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %uminv.i = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i32> %uminv.i, i32 0 + ret i32 %0 +} + +define i8 @test_vaddv_s8(<8 x i8> %a) { +; CHECK: test_vaddv_s8: +; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i8> %vaddv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vaddv_s16(<4 x i16> %a) { +; CHECK: test_vaddv_s16: +; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %vaddv.i = tail call <1 x i16> 
@llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i16> %vaddv.i, i32 0 + ret i16 %0 +} + +define i8 @test_vaddv_u8(<8 x i8> %a) { +; CHECK: test_vaddv_u8: +; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b +entry: + %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a) + %0 = extractelement <1 x i8> %vaddv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vaddv_u16(<4 x i16> %a) { +; CHECK: test_vaddv_u16: +; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h +entry: + %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a) + %0 = extractelement <1 x i16> %vaddv.i, i32 0 + ret i16 %0 +} + +define i8 @test_vaddvq_s8(<16 x i8> %a) { +; CHECK: test_vaddvq_s8: +; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i8> %vaddv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vaddvq_s16(<8 x i16> %a) { +; CHECK: test_vaddvq_s16: +; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i16> %vaddv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vaddvq_s32(<4 x i32> %a) { +; CHECK: test_vaddvq_s32: +; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i32> %vaddv.i, i32 0 + ret i32 %0 +} + +define i8 @test_vaddvq_u8(<16 x i8> %a) { +; CHECK: test_vaddvq_u8: +; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b +entry: + %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a) + %0 = extractelement <1 x i8> %vaddv.i, i32 0 + ret i8 %0 +} + +define i16 @test_vaddvq_u16(<8 x i16> %a) { +; CHECK: test_vaddvq_u16: +; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h +entry: + %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a) + %0 = extractelement <1 x i16> %vaddv.i, i32 0 + ret i16 %0 +} + +define i32 @test_vaddvq_u32(<4 x i32> %a) { +; CHECK: test_vaddvq_u32: +; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a) + %0 = extractelement <1 x i32> %vaddv.i, i32 0 + ret i32 %0 +} + +define float @test_vmaxvq_f32(<4 x float> %a) { +; CHECK: test_vmaxvq_f32: +; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %vmaxv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float> %a) + %0 = extractelement <1 x float> %vmaxv.i, i32 0 + ret float %0 +} + +define float @test_vminvq_f32(<4 x float> %a) { +; CHECK: test_vminvq_f32: +; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %vminv.i = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float> %a) + %0 = extractelement <1 x float> %vminv.i, i32 0 + ret float %0 +} + +define float @test_vmaxnmvq_f32(<4 x float> %a) { +; CHECK: test_vmaxnmvq_f32: +; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %vmaxnmv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float> %a) + %0 = extractelement <1 x float> %vmaxnmv.i, i32 0 + ret float %0 +} + +define float @test_vminnmvq_f32(<4 x float> %a) { +; CHECK: test_vminnmvq_f32: +; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s +entry: + %vminnmv.i = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float> %a) + %0 = extractelement <1 x float> %vminnmv.i, i32 0 + ret float %0 +} + diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll new file 
mode 100644 index 0000000000000..1abfed31908cd --- /dev/null +++ b/test/CodeGen/AArch64/neon-add-pairwise.ll @@ -0,0 +1,92 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 is possible, but would be odd. +; CHECK: test_addp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: addp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_addp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: addp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_addp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: addp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_addp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: addp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_addp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: addp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_addp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: addp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + +declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_addp_v2i64: + %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: addp v0.2d, v0.2d, v1.2d + ret <2 x i64> %val +} + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_faddp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: faddp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_faddp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: faddp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_faddp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: faddp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll new file mode
100644 index 0000000000000..078ba14bd87af --- /dev/null +++ b/test/CodeGen/AArch64/neon-add-sub.ll @@ -0,0 +1,237 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: add {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp3 = add <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: add {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp3 = add <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: add {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp3 = add <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: add {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp3 = add <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: add {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = add <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @add4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: add {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = add <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = add <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fadd {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fadd <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fadd {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fadd <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fadd <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: sub {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp3 = sub <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: sub {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp3 = sub <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: sub {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp3 = sub <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: sub {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp3 = sub <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: sub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = sub <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @sub4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: sub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = sub <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = sub <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fsub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fsub <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @sub4xfloat(<4 x float> 
%A, <4 x float> %B) { +;CHECK: fsub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fsub <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fsub <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +define <1 x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vadd_f64 +; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fadd <1 x double> %a, %b + ret <1 x double> %1 +} + +define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vmul_f64 +; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fmul <1 x double> %a, %b + ret <1 x double> %1 +} + +define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vdiv_f64 +; CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fdiv <1 x double> %a, %b + ret <1 x double> %1 +} + +define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { +; CHECK-LABEL: test_vmla_f64 +; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fmul <1 x double> %b, %c + %2 = fadd <1 x double> %1, %a + ret <1 x double> %2 +} + +define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { +; CHECK-LABEL: test_vmls_f64 +; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fmul <1 x double> %b, %c + %2 = fsub <1 x double> %a, %1 + ret <1 x double> %2 +} + +define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { +; CHECK-LABEL: test_vfms_f64 +; CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fsub <1 x double> <double -0.000000e+00>, %b + %2 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %1, <1 x double> %c, <1 x double> %a) + ret <1 x double> %2 +} + +define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) { +; CHECK-LABEL: test_vfma_f64 +; CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vsub_f64 +; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = fsub <1 x double> %a, %b + ret <1 x double> %1 +} + +define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vabd_f64 +; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %1 +} + +define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vmax_f64 +; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %1 +} + +define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vmin_f64 +; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %1 +} + +define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vmaxnm_f64 +; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x 
double> %a, <1 x double> %b) + ret <1 x double> %1 +} + +define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vminnm_f64 +; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %1 +} + +define <1 x double> @test_vabs_f64(<1 x double> %a) { +; CHECK-LABEL: test_vabs_f64 +; CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.fabs.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vneg_f64(<1 x double> %a) { +; CHECK-LABEL: test_vneg_f64 +; CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}} + %1 = fsub <1 x double> <double -0.000000e+00>, %a + ret <1 x double> %1 +} + +declare <1 x double> @llvm.fabs.v1f64(<1 x double>) +declare <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-bitcast.ll b/test/CodeGen/AArch64/neon-bitcast.ll new file mode 100644 index 0000000000000..f9ec704840247 --- /dev/null +++ b/test/CodeGen/AArch64/neon-bitcast.ll @@ -0,0 +1,574 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s + +; From <8 x i8> + +define <1 x i64> @test_v8i8_to_v1i64(<8 x i8> %in) nounwind { +; CHECK: test_v8i8_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v8i8_to_v2i32(<8 x i8> %in) nounwind { +; CHECK: test_v8i8_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v8i8_to_v1f32(<8 x i8> %in) nounwind{ +; CHECK: test_v8i8_to_v1f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v8i8_to_v4i16(<8 x i8> %in) nounwind{ +; CHECK: test_v8i8_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v8i8_to_v8i8(<8 x i8> %in) nounwind{ +; CHECK: test_v8i8_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <4 x i16> + +define <1 x i64> @test_v4i16_to_v1i64(<4 x i16> %in) nounwind { +; CHECK: test_v4i16_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v4i16_to_v2i32(<4 x i16> %in) nounwind { +; CHECK: test_v4i16_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v4i16_to_v1f32(<4 x i16> %in) nounwind{ +; CHECK: test_v4i16_to_v1f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v4i16_to_v4i16(<4 x i16> %in) nounwind{ +; CHECK: test_v4i16_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v4i16_to_v8i8(<4 x i16> %in) nounwind{ +; CHECK: test_v4i16_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <2 x i32> + +define <1 x i64> @test_v2i32_to_v1i64(<2 x i32> %in) nounwind { +; CHECK: test_v2i32_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v2i32_to_v2i32(<2 x i32> %in) nounwind { +; CHECK: test_v2i32_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v2i32_to_v1f32(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v1f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v2i32_to_v4i16(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v2i32_to_v8i8(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <8 x i8> + ret <8 x i8> %val +} + +; From 
<2 x float> + +define <1 x i64> @test_v2f32_to_v1i64(<2 x float> %in) nounwind { +; CHECK: test_v2f32_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v2f32_to_v2i32(<2 x float> %in) nounwind { +; CHECK: test_v2f32_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v2f32_to_v2f32(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v2f32_to_v4i16(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v2f32_to_v8i8(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <1 x i64> + +define <1 x i64> @test_v1i64_to_v1i64(<1 x i64> %in) nounwind { +; CHECK: test_v1i64_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v1i64_to_v2i32(<1 x i64> %in) nounwind { +; CHECK: test_v1i64_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v1i64_to_v2f32(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v1i64_to_v4i16(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v1i64_to_v8i8(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <8 x i8> + ret <8 x i8> %val +} + + +; From <16 x i8> + +define <2 x double> @test_v16i8_to_v2f64(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v16i8_to_v2i64(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v16i8_to_v4i32(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v16i8_to_v2f32(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v16i8_to_v8i16(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v16i8_to_v16i8(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <8 x i16> + +define <2 x double> @test_v8i16_to_v2f64(<8 x i16> %in) nounwind { +; 
CHECK: test_v8i16_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v8i16_to_v2i64(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v8i16_to_v4i32(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v8i16_to_v2f32(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v8i16_to_v8i16(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v8i16_to_v16i8(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <4 x i32> + +define <2 x double> @test_v4i32_to_v2f64(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v4i32_to_v2i64(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v4i32_to_v4i32(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v4i32_to_v2f32(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v4i32_to_v8i16(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v4i32_to_v16i8(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <4 x float> + +define <2 x double> @test_v4f32_to_v2f64(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v4f32_to_v2i64(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v4f32_to_v4i32(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v4f32_to_v4f32(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v4f32_to_v8i16(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = 
bitcast <4 x float> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v4f32_to_v16i8(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <2 x i64> + +define <2 x double> @test_v2i64_to_v2f64(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v2i64_to_v2i64(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v2i64_to_v4i32(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v2i64_to_v4f32(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v2i64_to_v8i16(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v2i64_to_v16i8(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <2 x double> + +define <2 x double> @test_v2f64_to_v2f64(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v2f64_to_v2i64(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v2f64_to_v4i32(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v2f64_to_v4f32(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v2f64_to_v8i16(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v2f64_to_v16i8(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <16 x i8> + ret <16 x i8> %val +} + diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll new file mode 100644 index 0000000000000..1c43b979fc449 --- /dev/null +++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -0,0 +1,594 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, 
{{v[0-31]+}}.16b + %tmp1 = and <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + + +define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + + +define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + +define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = or <8 x i8> %a, %tmp1 + ret <8 x i8> %tmp2 +} + +define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = or <16 x i8> %a, %tmp1 + ret <16 x i8> %tmp2 +} + +define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <8 x i8> %a, %tmp1 + ret <8 x i8> %tmp2 +} + +define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <16 x i8> %a, %tmp1 + ret <16 x i8> %tmp2 +} + +define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff + %tmp1 = or <2 x i32> %a, < i32 255, i32 255> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #8 + %tmp1 = or <2 x i32> %a, < i32 65280, i32 65280> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #16 + %tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #24 + %tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080> + ret <2 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, 
#0xff + %tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #8 + %tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #16 + %tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #24 + %tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080> + ret <4 x i32> %tmp1 +} + +define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff + %tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 > + ret <4 x i16> %tmp1 +} + +define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8 + %tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 > + ret <4 x i16> %tmp1 +} + +define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff + %tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > + ret <8 x i16> %tmp1 +} + +define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8 + %tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > + ret <8 x i16> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10 + %tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 > + ret <2 x i32> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #8 + %tmp1 = and <2 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519 > + ret <2 x i32> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #16 + %tmp1 = and <2 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039 > + ret <2 x i32> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl124(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #24 + %tmp1 = and <2 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159> + ret <2 x i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10 + %tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 > + ret <4 x i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #8 + %tmp1 = and <4 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519 > + ret <4 x i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #16 + %tmp1 = and <4 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039 > + ret <4 x i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl124(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #24 + %tmp1 = and <4 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159> + ret <4 x i32> %tmp1 +} + +define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x10 + %tmp1 = and <4 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 > + ret <4 x i16> %tmp1 +} + 
+define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x0 + %tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 > + ret <4 x i16> %tmp1 +} + +define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x10, lsl #8 + %tmp1 = and <4 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519> + ret <4 x i16> %tmp1 +} + +define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x0, lsl #8 + %tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255> + ret <4 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x10 + %tmp1 = and <8 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, + i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 > + ret <8 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x0 + %tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > + ret <8 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x10, lsl #8 + %tmp1 = and <8 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, + i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x0, lsl #8 + %tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + ret <8 x i16> %tmp1 +} + +define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <2 x i32> %a, %b; + ret <2 x i32> %tmp1 +} + +define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <4 x i16> %a, %b; + ret <4 x i16> %tmp1 +} + +define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + +define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <2 x i32> %a, %b; + ret <2 x i32> %tmp1 +} + +define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <4 x i16> %a, %b; + ret <4 x i16> %tmp1 +} + +define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + 
%tmp1 = or <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + +define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %a, %b; + ret <2 x i32> %tmp1 +} + +define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %a, %b; + ret <4 x i16> %tmp1 +} + +define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + + +define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > + %tmp2 = and <2 x i32> %a, %tmp1 + ret <2 x i32> %tmp2 +} + +define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 > + %tmp2 = and <4 x i16> %a, %tmp1 + ret <4 x i16> %tmp2 +} + +define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <1 x i64> %b, < i64 -1> + %tmp2 = and <1 x i64> %a, %tmp1 + ret <1 x i64> %tmp2 +} + +define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1> + %tmp2 = and <4 x i32> %a, %tmp1 + ret <4 x i32> %tmp2 +} + +define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp2 = and <8 x i16> %a, %tmp1 + ret <8 x i16> %tmp2 +} + +define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1> + %tmp2 = and <2 x i64> %a, %tmp1 + ret <2 x i64> %tmp2 +} + +define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > + %tmp2 = or <2 x i32> %a, %tmp1 + ret <2 x i32> %tmp2 +} + +define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 > + %tmp2 = or <4 x i16> %a, %tmp1 + ret <4 x i16> %tmp2 +} + +define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + 
%tmp1 = xor <1 x i64> %b, < i64 -1> + %tmp2 = or <1 x i64> %a, %tmp1 + ret <1 x i64> %tmp2 +} + +define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1> + %tmp2 = or <4 x i32> %a, %tmp1 + ret <4 x i32> %tmp2 +} + +define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp2 = or <8 x i16> %a, %tmp1 + ret <8 x i16> %tmp2 +} + +define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1> + %tmp2 = or <2 x i64> %a, %tmp1 + ret <2 x i64> %tmp2 +} +define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 > + %tmp2 = and <2 x i32> %b, < i32 0, i32 0 > + %tmp3 = or <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + + +define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1 > + %tmp2 = and <4 x i16> %b, < i16 0, i16 0,i16 0, i16 0 > + %tmp3 = or <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <1 x i64> %a, < i64 -1 > + %tmp2 = and <1 x i64> %b, < i64 0 > + %tmp3 = or <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <4 x i32> %a, < i32 -1, i32 -1, i32 -1, i32 -1 > + %tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 > + %tmp3 = or <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1, i16 -1, i16 -1, i16 -1,i16 -1 > + %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 > + %tmp3 = or <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 > + %tmp2 = and <2 x i64> %b, < i64 0, i64 0 > + %tmp3 = or <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + + +define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <8 x i8> %v1, %v2 + %2 = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %3 = and <8 x i8> %2, %v3 + %4 = or <8 x i8> %1, %3 + ret <8 x i8> %4 +} + +define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <4 x i16> %v1, %v2 + %2 = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1> + %3 = and <4 x i16> %2, %v3 + %4 = or <4 x i16> %1, %3 + ret <4 x i16> %4 +} + +define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <2 x i32> %v1, %v2 + %2 = xor <2 x i32> %v1, <i32 -1, i32 -1> + %3 = and <2 x i32> %2, %v3 + 
%4 = or <2 x i32> %1, %3 + ret <2 x i32> %4 +} + +define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <1 x i64> %v1, %v2 + %2 = xor <1 x i64> %v1, <i64 -1> + %3 = and <1 x i64> %2, %v3 + %4 = or <1 x i64> %1, %3 + ret <1 x i64> %4 +} + +define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <16 x i8> %v1, %v2 + %2 = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %3 = and <16 x i8> %2, %v3 + %4 = or <16 x i8> %1, %3 + ret <16 x i8> %4 +} + +define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <8 x i16> %v1, %v2 + %2 = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %3 = and <8 x i16> %2, %v3 + %4 = or <8 x i16> %1, %3 + ret <8 x i16> %4 +} + +define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <4 x i32> %v1, %v2 + %2 = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1> + %3 = and <4 x i32> %2, %v3 + %4 = or <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <2 x i64> %v1, %v2 + %2 = xor <2 x i64> %v1, <i64 -1, i64 -1> + %3 = and <2 x i64> %2, %v3 + %4 = or <2 x i64> %1, %3 + ret <2 x i64> %4 +} + +define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff + %val = or <8 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0> + ret <8 x i8> %val +} + +define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8 + %val = or <8 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255> + ret <8 x i8> %val +} + +define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff + %val = or <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0> + ret <16 x i8> %val +} + +define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8 + %val = or <16 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255> + ret <16 x i8> %val +} + + diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll new file mode 100644 index 0000000000000..6bd923dc2ccae --- /dev/null +++ b/test/CodeGen/AArch64/neon-bsl.ll @@ -0,0 +1,222 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double>, <2 x double>, <2 x double>) + +declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) + +declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) + +declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>) + +declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) + +declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) + +declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, 
<8 x i8>, <8 x i8>) + +declare <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double>, <1 x double>, <1 x double>) + +declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>) + +declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) + +declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) + +define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { +; CHECK-LABEL: test_vbsl_s8: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { +; CHECK-LABEL: test_vbsl_s16: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) + %0 = bitcast <4 x i16> %vbsl3.i to <8 x i8> + ret <8 x i8> %0 +} + +define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { +; CHECK-LABEL: test_vbsl_s32: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) + ret <2 x i32> %vbsl3.i +} + +define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_vbsl_s64: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) + ret <1 x i64> %vbsl3.i +} + +define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { +; CHECK-LABEL: test_vbsl_u8: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) + ret <8 x i8> %vbsl.i +} + +define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { +; CHECK-LABEL: test_vbsl_u16: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) + ret <4 x i16> %vbsl3.i +} + +define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { +; CHECK-LABEL: test_vbsl_u32: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) + ret <2 x i32> %vbsl3.i +} + +define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_vbsl_u64: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) + ret <1 x i64> %vbsl3.i +} + +define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) { +; CHECK-LABEL: test_vbsl_f32: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) + ret <2 x float> %vbsl3.i +} + +define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) { +; CHECK-LABEL: test_vbsl_f64: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl.i = bitcast <1 x i64> %v1 to <1 x double> + %vbsl3.i = tail call <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double> %vbsl.i, <1 x double> 
%v2, <1 x double> %v3) + ret <1 x double> %vbsl3.i +} + +define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { +; CHECK-LABEL: test_vbsl_p8: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) + ret <8 x i8> %vbsl.i +} + +define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { +; CHECK-LABEL: test_vbsl_p16: +; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) + ret <4 x i16> %vbsl3.i +} + +define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { +; CHECK-LABEL: test_vbslq_s8: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) + ret <16 x i8> %vbsl.i +} + +define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { +; CHECK-LABEL: test_vbslq_s16: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) + ret <8 x i16> %vbsl3.i +} + +define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: test_vbslq_s32: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) + ret <4 x i32> %vbsl3.i +} + +define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { +; CHECK-LABEL: test_vbslq_s64: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) + ret <2 x i64> %vbsl3.i +} + +define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { +; CHECK-LABEL: test_vbslq_u8: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) + ret <16 x i8> %vbsl.i +} + +define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { +; CHECK-LABEL: test_vbslq_u16: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) + ret <8 x i16> %vbsl3.i +} + +define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: test_vbslq_u32: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) + ret <4 x i32> %vbsl3.i +} + +define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { +; CHECK-LABEL: test_vbslq_u64: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) + ret <2 x i64> %vbsl3.i +} + +define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) { +; CHECK-LABEL: test_vbslq_f32: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl.i = bitcast <4 x i32> %v1 to <4 x float> + %vbsl3.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %vbsl.i, <4 x float> %v2, 
<4 x float> %v3) + ret <4 x float> %vbsl3.i +} + +define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { +; CHECK-LABEL: test_vbslq_p8: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) + ret <16 x i8> %vbsl.i +} + +define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { +; CHECK-LABEL: test_vbslq_p16: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) + ret <8 x i16> %vbsl3.i +} + +define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) { +; CHECK-LABEL: test_vbslq_f64: +; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vbsl.i = bitcast <2 x i64> %v1 to <2 x double> + %vbsl3.i = tail call <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double> %vbsl.i, <2 x double> %v2, <2 x double> %v3) + ret <2 x double> %vbsl3.i +} + diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll new file mode 100644 index 0000000000000..68f03425b2765 --- /dev/null +++ b/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -0,0 +1,1926 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp eq <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp eq <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp eq <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp eq <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp eq <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp eq <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp eq <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define 
<4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp sgt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp sgt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp sgt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp sgt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp sgt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp sgt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp sgt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp slt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. 
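+; For signed values, A < B holds exactly when B > A, so ISel can reuse CMGT with the operands swapped rather than needing a separate register form of CMLT.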
+;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp slt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp slt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp slt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp slt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp slt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp slt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp sge <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp sge <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp sge <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp sge <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp sge <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp sge <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp sge <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp sle <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp sle <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp sle <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp sle <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp sle <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp sle <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. 
+;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp sle <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ugt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ugt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp ugt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp ugt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp ugt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp ugt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp ugt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ult <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ult <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ult <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ult <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ult <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ult <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ult <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp uge <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp uge <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp uge <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp uge <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp uge <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp uge <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp uge <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ule <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ule <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ule <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. 
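+; The same operand swap works for the unsigned forms: A <= B (LS) is equivalent to B >= A (HS).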
+;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ule <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ule <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ule <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ule <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = and <8 x i8> %A, %B + %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i8> + ret <8 x i8> %tmp5 +} + +define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = and <16 x i8> %A, %B + %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer + %tmp5 = sext <16 x i1> %tmp4 to <16 x i8> + ret <16 x i8> %tmp5 +} + +define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = and <4 x i16> %A, %B + %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i16> + ret <4 x i16> %tmp5 +} + +define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = and <8 x i16> %A, %B + %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i16> + ret <8 x i16> %tmp5 +} + +define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = and <2 x i32> %A, %B + %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer + %tmp5 = sext <2 x i1> %tmp4 to <2 x i32> + ret <2 x i32> %tmp5 +} + +define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = and <4 x i32> %A, %B + %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i32> + ret <4 x i32> %tmp5 +} + +define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = and <2 x i64> %A, %B + %tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer + %tmp5 = sext <2 x i1> %tmp4 to <2 x i64> + ret <2 x i64> %tmp5 +} + + + +define <8 x i8> @cmeqz8xi8(<8 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp eq <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmeqz16xi8(<16 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp eq <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmeqz4xi16(<4 x i16> %A) { +;CHECK: cmeq 
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp eq <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmeqz8xi16(<8 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp eq <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmeqz2xi32(<2 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp eq <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmeqz4xi32(<4 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp eq <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmeqz2xi64(<2 x i64> %A) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp eq <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmgez8xi8(<8 x i8> %A) { +;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sge <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgez16xi8(<16 x i8> %A) { +;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sge <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgez4xi16(<4 x i16> %A) { +;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sge <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmgez8xi16(<8 x i16> %A) { +;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sge <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgez2xi32(<2 x i32> %A) { +;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sge <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgez4xi32(<4 x i32> %A) { +;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sge <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgez2xi64(<2 x i64> %A) { +;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sge <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmgtz8xi8(<8 x i8> %A) { +;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sgt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgtz16xi8(<16 x i8> %A) { +;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sgt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgtz4xi16(<4 x i16> %A) { +;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sgt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmgtz8xi16(<8 x i16> %A) { +;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sgt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgtz2xi32(<2 x i32> %A) { +;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sgt <2 x i32> %A, zeroinitializer; + 
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgtz4xi32(<4 x i32> %A) { +;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sgt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgtz2xi64(<2 x i64> %A) { +;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sgt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlez8xi8(<8 x i8> %A) { +;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sle <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlez16xi8(<16 x i8> %A) { +;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sle <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlez4xi16(<4 x i16> %A) { +;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sle <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlez8xi16(<8 x i16> %A) { +;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sle <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlez2xi32(<2 x i32> %A) { +;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sle <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlez4xi32(<4 x i32> %A) { +;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sle <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlez2xi64(<2 x i64> %A) { +;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sle <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmltz8xi8(<8 x i8> %A) { +;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp slt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmltz16xi8(<16 x i8> %A) { +;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp slt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmltz4xi16(<4 x i16> %A) { +;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp slt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmltz8xi16(<8 x i16> %A) { +;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp slt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmltz2xi32(<2 x i32> %A) { +;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp slt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmltz4xi32(<4 x i32> %A) { +;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp slt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmltz2xi64(<2 x i64> %A) { +;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp slt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> 
@cmneqz8xi8(<8 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmneqz16xi8(<16 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmneqz4xi16(<4 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmneqz8xi16(<8 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmneqz2xi32(<2 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmneqz4xi32(<4 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmneqz2xi64(<2 x i64> %A) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhsz8xi8(<8 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp uge <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhsz16xi8(<16 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp uge <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhsz4xi16(<4 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp uge <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhsz8xi16(<8 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp uge <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhsz2xi32(<2 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp uge <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhsz4xi32(<4 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp uge <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhsz2xi64(<2 x i64> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp uge <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmhiz8xi8(<8 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ugt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhiz16xi8(<16 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ugt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhiz4xi16(<4 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp ugt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhiz8xi16(<8 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp ugt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhiz2xi32(<2 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp ugt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhiz4xi32(<4 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp ugt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhiz2xi64(<2 x i64> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp ugt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlsz8xi8(<8 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ule <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlsz16xi8(<16 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ule <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlsz4xi16(<4 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ule <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlsz8xi16(<8 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. 
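+; Unlike the signed compares above, the unsigned forms have no compare-against-immediate-zero encoding, so the zero vector is expected to be materialized with movi first.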
+;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ule <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlsz2xi32(<2 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ule <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlsz4xi32(<4 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ule <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlsz2xi64(<2 x i64> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ule <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmloz8xi8(<8 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ult <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmloz16xi8(<16 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ult <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmloz4xi16(<4 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ult <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmloz8xi16(<8 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ult <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmloz2xi32(<2 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ult <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmloz4xi32(<4 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. 
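+; An unsigned x < 0 is never true, but the expected code is still the generic pattern: movi a zero vector, then CMHI with the operands swapped.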
+;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ult <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmloz2xi64(<2 x i64> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ult <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmoeq2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fcmp oeq <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoeq4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fcmp oeq <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmoeq2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fcmp oeq <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmoge2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fcmp oge <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoge4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fcmp oge <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmoge2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fcmp oge <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmogt2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fcmp ogt <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogt4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fcmp ogt <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogt2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fcmp ogt <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmole2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; OLE implemented as OGE, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = fcmp ole <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmole4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; OLE implemented as OGE, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = fcmp ole <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmole2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = fcmp ole <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmolt2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = fcmp olt <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmolt4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = fcmp olt <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmolt2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = fcmp olt <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmone2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp one <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmone4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ord <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+
+define <4 x i32> @fcmord4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uno <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ueq <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uge <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGT = ULT with swapped operands, ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ugt <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGT = ULT with swapped operands, ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ule <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ult <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. +;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp une <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. +;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. 
+;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmoeqz2xfloat(<2 x float> %A) { +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp oeq <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoeqz4xfloat(<4 x float> %A) { +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp oeq <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) { +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp oeq <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmogez2xfloat(<2 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp oge <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogez4xfloat(<4 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp oge <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogez2xdouble(<2 x double> %A) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp oge <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmogtz2xfloat(<2 x float> %A) { +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp ogt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogtz4xfloat(<4 x float> %A) { +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp ogt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogtz2xdouble(<2 x double> %A) { +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp ogt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmoltz2xfloat(<2 x float> %A) { +;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp olt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoltz4xfloat(<4 x float> %A) { +;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp olt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmoltz2xdouble(<2 x double> %A) { +;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp olt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmolez2xfloat(<2 x float> %A) { +;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp ole <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmolez4xfloat(<4 x float> %A) { +;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp ole <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmolez2xdouble(<2 x double> %A) { +;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp ole <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to 
<2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmonez2xfloat(<2 x float> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp one <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmonez4xfloat(<4 x float> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp one <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmonez2xdouble(<2 x double> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp one <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmordz2xfloat(<2 x float> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ord <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmordz4xfloat(<4 x float> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmordz2xdouble(<2 x double> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ueq <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <2 x double> %A, zeroinitializer + 
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugez2xfloat(<2 x float> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uge <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugez4xfloat(<4 x float> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugez2xdouble(<2 x double> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ugt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmultz2xfloat(<2 x float> %A) { +; ULT with zero = !OGE +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ult <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmultz4xfloat(<4 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmultz2xdouble(<2 x double> %A) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmulez2xfloat(<2 x float> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ule <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmulez4xfloat(<4 x float> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmulez2xdouble(<2 x double> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <2 x double> %A, zeroinitializer + 
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmunez2xfloat(<2 x float> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp une <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmunez4xfloat(<4 x float> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmunez2xdouble(<2 x double> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uno <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 + +} diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/neon-copy.ll new file mode 100644 index 0000000000000..e18530e6ff8e0 --- /dev/null +++ b/test/CodeGen/AArch64/neon-copy.ll @@ -0,0 +1,615 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + + +define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[15], {{w[0-31]+}} + %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[6], {{w[0-31]+}} + %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[2], {{w[0-31]+}} + %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{x[0-31]+}} + %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1 + ret <2 x i64> %tmp3 +} + +define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[5], {{w[0-31]+}} + %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5 + ret 
<8 x i8> %tmp3 +} + +define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[3], {{w[0-31]+}} + %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{w[0-31]+}} + %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1 + ret <2 x i32> %tmp3 +} + +define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2] + %tmp3 = extractelement <16 x i8> %tmp1, i32 2 + %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 + ret <16 x i8> %tmp4 +} + +define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2] + %tmp3 = extractelement <8 x i16> %tmp1, i32 2 + %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 + ret <8 x i16> %tmp4 +} + +define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x i32> %tmp1, i32 2 + %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 + ret <4 x i32> %tmp4 +} + +define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x i64> %tmp1, i32 0 + %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 + ret <2 x i64> %tmp4 +} + +define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x float> %tmp1, i32 2 + %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 + ret <4 x float> %tmp4 +} + +define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x double> %tmp1, i32 0 + %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 + ret <2 x double> %tmp4 +} + +define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2] + %tmp3 = extractelement <8 x i8> %tmp1, i32 2 + %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15 + ret <16 x i8> %tmp4 +} + +define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2] + %tmp3 = extractelement <4 x i16> %tmp1, i32 2 + %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7 + ret <8 x i16> %tmp4 +} + +define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1] + %tmp3 = extractelement <2 x i32> %tmp1, i32 1 + %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1 + ret <4 x i32> %tmp4 +} + +define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <1 x i64> %tmp1, i32 0 + %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1 + ret <2 x i64> %tmp4 +} + +define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1] + %tmp3 = extractelement <2 x float> %tmp1, i32 1 + %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1 + ret <4 x float> %tmp4 +} + +define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <1 x double> %tmp1, i32 0 + %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1 + ret <2 x double> %tmp4 +} + +define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[7], {{v[0-31]+}}.b[2] + %tmp3 = extractelement <16 x i8> 
%tmp1, i32 2 + %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7 + ret <8 x i8> %tmp4 +} + +define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2] + %tmp3 = extractelement <8 x i16> %tmp1, i32 2 + %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x i32> %tmp1, i32 2 + %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 + ret <2 x i32> %tmp4 +} + +define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x i64> %tmp1, i32 0 + %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 + ret <1 x i64> %tmp4 +} + +define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x float> %tmp1, i32 2 + %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 + ret <2 x float> %tmp4 +} + +define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x double> %tmp1, i32 0 + %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 + ret <1 x double> %tmp4 +} + +define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) { +;CHECK: ins {{v[0-31]+}}.b[4], {{v[0-31]+}}.b[2] + %tmp3 = extractelement <8 x i8> %tmp1, i32 2 + %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4 + ret <8 x i8> %tmp4 +} + +define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) { +;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2] + %tmp3 = extractelement <4 x i16> %tmp1, i32 2 + %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0] + %tmp3 = extractelement <2 x i32> %tmp1, i32 0 + %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1 + ret <2 x i32> %tmp4 +} + +define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <1 x i64> %tmp1, i32 0 + %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0 + ret <1 x i64> %tmp4 +} + +define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) { +;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0] + %tmp3 = extractelement <2 x float> %tmp1, i32 0 + %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1 + ret <2 x float> %tmp4 +} + +define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) { +;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0] + %tmp3 = extractelement <1 x double> %tmp1, i32 0 + %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0 + ret <1 x double> %tmp4 +} + +define i32 @umovw16b(<16 x i8> %tmp1) { +;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[8] + %tmp3 = extractelement <16 x i8> %tmp1, i32 8 + %tmp4 = zext i8 %tmp3 to i32 + ret i32 %tmp4 +} + +define i32 @umovw8h(<8 x i16> %tmp1) { +;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2] + %tmp3 = extractelement <8 x i16> %tmp1, i32 2 + %tmp4 = zext i16 %tmp3 to i32 + ret i32 %tmp4 +} + +define i32 @umovw4s(<4 x i32> %tmp1) { +;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x i32> %tmp1, i32 2 + ret i32 %tmp3 +} + +define i64 @umovx2d(<2 x i64> %tmp1) { +;CHECK: umov {{x[0-31]+}}, {{v[0-31]+}}.d[0] + %tmp3 = extractelement <2 x i64> %tmp1, i32 0 + ret i64 %tmp3 +} + +define i32 
@umovw8b(<8 x i8> %tmp1) { +;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[7] + %tmp3 = extractelement <8 x i8> %tmp1, i32 7 + %tmp4 = zext i8 %tmp3 to i32 + ret i32 %tmp4 +} + +define i32 @umovw4h(<4 x i16> %tmp1) { +;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2] + %tmp3 = extractelement <4 x i16> %tmp1, i32 2 + %tmp4 = zext i16 %tmp3 to i32 + ret i32 %tmp4 +} + +define i32 @umovw2s(<2 x i32> %tmp1) { +;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[1] + %tmp3 = extractelement <2 x i32> %tmp1, i32 1 + ret i32 %tmp3 +} + +define i64 @umovx1d(<1 x i64> %tmp1) { +;CHECK: fmov {{x[0-31]+}}, {{d[0-31]+}} + %tmp3 = extractelement <1 x i64> %tmp1, i32 0 + ret i64 %tmp3 +} + +define i32 @smovw16b(<16 x i8> %tmp1) { +;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[8] + %tmp3 = extractelement <16 x i8> %tmp1, i32 8 + %tmp4 = sext i8 %tmp3 to i32 + %tmp5 = add i32 5, %tmp4 + ret i32 %tmp5 +} + +define i32 @smovw8h(<8 x i16> %tmp1) { +;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2] + %tmp3 = extractelement <8 x i16> %tmp1, i32 2 + %tmp4 = sext i16 %tmp3 to i32 + %tmp5 = add i32 5, %tmp4 + ret i32 %tmp5 +} + +define i32 @smovx16b(<16 x i8> %tmp1) { +;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[8] + %tmp3 = extractelement <16 x i8> %tmp1, i32 8 + %tmp4 = sext i8 %tmp3 to i32 + ret i32 %tmp4 +} + +define i32 @smovx8h(<8 x i16> %tmp1) { +;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2] + %tmp3 = extractelement <8 x i16> %tmp1, i32 2 + %tmp4 = sext i16 %tmp3 to i32 + ret i32 %tmp4 +} + +define i64 @smovx4s(<4 x i32> %tmp1) { +;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[2] + %tmp3 = extractelement <4 x i32> %tmp1, i32 2 + %tmp4 = sext i32 %tmp3 to i64 + ret i64 %tmp4 +} + +define i32 @smovw8b(<8 x i8> %tmp1) { +;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[4] + %tmp3 = extractelement <8 x i8> %tmp1, i32 4 + %tmp4 = sext i8 %tmp3 to i32 + %tmp5 = add i32 5, %tmp4 + ret i32 %tmp5 +} + +define i32 @smovw4h(<4 x i16> %tmp1) { +;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2] + %tmp3 = extractelement <4 x i16> %tmp1, i32 2 + %tmp4 = sext i16 %tmp3 to i32 + %tmp5 = add i32 5, %tmp4 + ret i32 %tmp5 +} + +define i32 @smovx8b(<8 x i8> %tmp1) { +;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[6] + %tmp3 = extractelement <8 x i8> %tmp1, i32 6 + %tmp4 = sext i8 %tmp3 to i32 + ret i32 %tmp4 +} + +define i32 @smovx4h(<4 x i16> %tmp1) { +;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2] + %tmp3 = extractelement <4 x i16> %tmp1, i32 2 + %tmp4 = sext i16 %tmp3 to i32 + ret i32 %tmp4 +} + +define i64 @smovx2s(<2 x i32> %tmp1) { +;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[1] + %tmp3 = extractelement <2 x i32> %tmp1, i32 1 + %tmp4 = sext i32 %tmp3 to i64 + ret i64 %tmp4 +} + +define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) { +;CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3] + %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7> + ret <8 x i8> %vset_lane +} + +define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) { +;CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6] + %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15> + ret <16 x i8> %vset_lane +} + +define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) { +;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0] + %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0> + ret <8 x i8> %vset_lane +} + +define <16 x i8> 
@test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) { +;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15] + %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vset_lane +} + +define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 { +;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} + %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0 + %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1 + %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2 + %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3 + %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4 + %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5 + %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6 + %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7 + ret <8 x i8> %vecinit7.i +} + +define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 { +;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}} + %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0 + %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1 + %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2 + %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3 + ret <4 x i16> %vecinit3.i +} + +define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 { +;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}} + %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0 + %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1 + ret <2 x i32> %vecinit1.i +} + +define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 { +;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} + %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0 + ret <1 x i64> %vecinit.i +} + +define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 { +;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}} + %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0 + %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1 + %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2 + %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3 + %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4 + %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5 + %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6 + %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7 + %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8 + %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9 + %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10 + %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11 + %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12 + %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13 + %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14 + %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15 + ret <16 x i8> %vecinit15.i +} + +define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 { +;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} + %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0 + %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1 + %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2 + %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3 + %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4 + %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5 + %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, 
i32 6 + %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7 + ret <8 x i16> %vecinit7.i +} + +define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 { +;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} + %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0 + %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1 + %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2 + %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3 + ret <4 x i32> %vecinit3.i +} + +define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 { +;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} + %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0 + %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1 + ret <2 x i64> %vecinit1.i +} + +define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5] + %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <8 x i8> %shuffle +} + +define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2] + %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + ret <4 x i16> %shuffle +} + +define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] + %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + ret <2 x i32> %shuffle +} + +define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 { +;CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5] + %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <16 x i8> %shuffle +} + +define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 { +;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2] + %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x i16> %shuffle +} + +define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 { +;CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] + %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %shuffle +} + +define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 { +;CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] + %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %shuffle +} + +define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5] + %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <8 x i8> %shuffle +} + +define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2] + %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + ret <4 x i16> %shuffle +} + +define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] + %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1> + ret <2 x i32> %shuffle +} + +define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5] + %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <16 x i8> %shuffle +} + +define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 { 
+;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2] + %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x i16> %shuffle +} + +define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] + %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %shuffle +} + +define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 { +;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] + %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %shuffle +} + +define i64 @test_bitcastv8i8toi64(<8 x i8> %in) { +; CHECK-LABEL: test_bitcastv8i8toi64: + %res = bitcast <8 x i8> %in to i64 +; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} + ret i64 %res +} + +define i64 @test_bitcastv4i16toi64(<4 x i16> %in) { +; CHECK-LABEL: test_bitcastv4i16toi64: + %res = bitcast <4 x i16> %in to i64 +; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} + ret i64 %res +} + +define i64 @test_bitcastv2i32toi64(<2 x i32> %in) { +; CHECK-LABEL: test_bitcastv2i32toi64: + %res = bitcast <2 x i32> %in to i64 +; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} + ret i64 %res +} + +define i64 @test_bitcastv2f32toi64(<2 x float> %in) { +; CHECK-LABEL: test_bitcastv2f32toi64: + %res = bitcast <2 x float> %in to i64 +; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} + ret i64 %res +} + +define i64 @test_bitcastv1i64toi64(<1 x i64> %in) { +; CHECK-LABEL: test_bitcastv1i64toi64: + %res = bitcast <1 x i64> %in to i64 +; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} + ret i64 %res +} + +define i64 @test_bitcastv1f64toi64(<1 x double> %in) { +; CHECK-LABEL: test_bitcastv1f64toi64: + %res = bitcast <1 x double> %in to i64 +; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}} + ret i64 %res +} + +define <8 x i8> @test_bitcasti64tov8i8(i64 %in) { +; CHECK-LABEL: test_bitcasti64tov8i8: + %res = bitcast i64 %in to <8 x i8> +; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} + ret <8 x i8> %res +} + +define <4 x i16> @test_bitcasti64tov4i16(i64 %in) { +; CHECK-LABEL: test_bitcasti64tov4i16: + %res = bitcast i64 %in to <4 x i16> +; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} + ret <4 x i16> %res +} + +define <2 x i32> @test_bitcasti64tov2i32(i64 %in) { +; CHECK-LABEL: test_bitcasti64tov2i32: + %res = bitcast i64 %in to <2 x i32> +; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} + ret <2 x i32> %res +} + +define <2 x float> @test_bitcasti64tov2f32(i64 %in) { +; CHECK-LABEL: test_bitcasti64tov2f32: + %res = bitcast i64 %in to <2 x float> +; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} + ret <2 x float> %res +} + +define <1 x i64> @test_bitcasti64tov1i64(i64 %in) { +; CHECK-LABEL: test_bitcasti64tov1i64: + %res = bitcast i64 %in to <1 x i64> +; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} + ret <1 x i64> %res +} + +define <1 x double> @test_bitcasti64tov1f64(i64 %in) { +; CHECK-LABEL: test_bitcasti64tov1f64: + %res = bitcast i64 %in to <1 x double> +; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}} + ret <1 x double> %res +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll new file mode 100644 index 0000000000000..0283e0e7ca2ec --- /dev/null +++ b/test/CodeGen/AArch64/neon-crypto.ll @@ -0,0 +1,149 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s +; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s + +declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32>, <1 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32>, <1 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32>, <1 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>) #1 + +declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>) #1 + +declare <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32>) #1 + +declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>) #1 + +declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>) #1 + +declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>) #1 + +declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>) #1 + +define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) { +; CHECK: test_vaeseq_u8: +; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese +entry: + %aese.i = tail call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %aese.i +} + +define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) { +; CHECK: test_vaesdq_u8: +; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %aesd.i +} + +define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) { +; CHECK: test_vaesmcq_u8: +; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %data) + ret <16 x i8> %aesmc.i +} + +define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) { +; CHECK: test_vaesimcq_u8: +; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %data) + ret <16 x i8> %aesimc.i +} + +define i32 @test_vsha1h_u32(i32 %hash_e) { +; CHECK: test_vsha1h_u32: +; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}} +entry: + %sha1h.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 + %sha1h1.i = tail call <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32> %sha1h.i) + %0 = extractelement <1 x i32> %sha1h1.i, i32 0 + ret i32 %0 +} + +define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) { +; CHECK: test_vsha1su1q_u32: +; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w12_15) + ret <4 x i32> %sha1su12.i +} + +define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7) { +; CHECK: test_vsha256su0q_u32: +; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> 
%w0_3, <4 x i32> %w4_7) + ret <4 x i32> %sha256su02.i +} + +define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK: test_vsha1cq_u32: +; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s +entry: + %sha1c.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 + %sha1c1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32> %hash_abcd, <1 x i32> %sha1c.i, <4 x i32> %wk) + ret <4 x i32> %sha1c1.i +} + +define <4 x i32> @test_vsha1pq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK: test_vsha1pq_u32: +; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s +entry: + %sha1p.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 + %sha1p1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32> %hash_abcd, <1 x i32> %sha1p.i, <4 x i32> %wk) + ret <4 x i32> %sha1p1.i +} + +define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK: test_vsha1mq_u32: +; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s +entry: + %sha1m.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0 + %sha1m1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32> %hash_abcd, <1 x i32> %sha1m.i, <4 x i32> %wk) + ret <4 x i32> %sha1m1.i +} + +define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) { +; CHECK: test_vsha1su0q_u32: +; CHECK: sha1su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) + ret <4 x i32> %sha1su03.i +} + +define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { +; CHECK: test_vsha256hq_u32: +; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s +entry: + %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) + ret <4 x i32> %sha256h3.i +} + +define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) { +; CHECK: test_vsha256h2q_u32: +; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s +entry: + %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) + ret <4 x i32> %sha256h23.i +} + +define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { +; CHECK: test_vsha256su1q_u32: +; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + ret <4 x i32> %sha256su13.i +} + diff --git a/test/CodeGen/AArch64/neon-diagnostics.ll b/test/CodeGen/AArch64/neon-diagnostics.ll new file mode 100644 index 0000000000000..f546aa7d33414 --- /dev/null +++ b/test/CodeGen/AArch64/neon-diagnostics.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { +; CHECK: test_vfma_lane_f32: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> + %mul = fmul <2 x float> %shuffle, %b + %add = fadd <2 x float> %mul, %a + ret <2 x float> %add +} + +define <4 x i32> @test_vshrn_not_match(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vshrn_not_match +; CHECK-NOT: 
shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #35 + %1 = bitcast <2 x i32> %a to <1 x i64> + %2 = ashr <2 x i64> %b, <i64 35, i64 35> + %vshrn_n = trunc <2 x i64> %2 to <2 x i32> + %3 = bitcast <2 x i32> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1> + %4 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %4 +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll new file mode 100644 index 0000000000000..5c52cd30676a2 --- /dev/null +++ b/test/CodeGen/AArch64/neon-extract.ll @@ -0,0 +1,190 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vext_s8: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2 +entry: + %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> + ret <8 x i8> %vext +} + +define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vext_s16: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6 +entry: + %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> + ret <4 x i16> %vext +} + +define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vext_s32: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4 +entry: + %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2> + ret <2 x i32> %vext +} + +define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) { +; CHECK: test_vext_s64: +entry: + %vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0> + ret <1 x i64> %vext +} + +define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vextq_s8: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2 +entry: + %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> + ret <16 x i8> %vext +} + +define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vextq_s16: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6 +entry: + %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x i16> %vext +} + +define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vextq_s32: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4 +entry: + %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> + ret <4 x i32> %vext +} + +define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vextq_s64: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8 +entry: + %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> + ret <2 x i64> %vext +} + +define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vext_u8: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2 +entry: + %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> + ret <8 x i8> %vext +} + +define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vext_u16: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6 +entry: + %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> + ret <4 x i16> %vext +} + +define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vext_u32: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4 +entry: + %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2> + ret <2 x i32> %vext +} + +define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) { 
+; CHECK: test_vext_u64: +entry: + %vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0> + ret <1 x i64> %vext +} + +define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vextq_u8: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2 +entry: + %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> + ret <16 x i8> %vext +} + +define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vextq_u16: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6 +entry: + %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x i16> %vext +} + +define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vextq_u32: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4 +entry: + %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> + ret <4 x i32> %vext +} + +define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vextq_u64: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8 +entry: + %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> + ret <2 x i64> %vext +} + +define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vext_f32: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4 +entry: + %vext = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 2> + ret <2 x float> %vext +} + +define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) { +; CHECK: test_vext_f64: +entry: + %vext = shufflevector <1 x double> %a, <1 x double> %b, <1 x i32> <i32 0> + ret <1 x double> %vext +} + +define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vextq_f32: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4 +entry: + %vext = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> + ret <4 x float> %vext +} + +define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) { +; CHECK: test_vextq_f64: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8 +entry: + %vext = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2> + ret <2 x double> %vext +} + +define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vext_p8: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2 +entry: + %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> + ret <8 x i8> %vext +} + +define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vext_p16: +; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6 +entry: + %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> + ret <4 x i16> %vext +} + +define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vextq_p8: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2 +entry: + %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> + ret <16 x i8> %vext +} + +define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vextq_p16: +; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, 
{{v[0-9]+}}.16b, #0x6 +entry: + %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x i16> %vext +} diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll new file mode 100644 index 0000000000000..146256e4be112 --- /dev/null +++ b/test/CodeGen/AArch64/neon-facge-facgt.ll @@ -0,0 +1,56 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double>, <2 x double>) + +define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: facge_from_intr_v2i32: + %val = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %A, <2 x float> %B) +; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + ret <2 x i32> %val +} +define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: facge_from_intr_v4i32: + %val = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %A, <4 x float> %B) +; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + ret <4 x i32> %val +} + +define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: facge_from_intr_v2i64: + %val = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %A, <2 x double> %B) +; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret <2 x i64> %val +} + +declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double>, <2 x double>) + +define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: facgt_from_intr_v2i32: + %val = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %A, <2 x float> %B) +; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + ret <2 x i32> %val +} +define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: facgt_from_intr_v4i32: + %val = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %A, <4 x float> %B) +; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + ret <4 x i32> %val +} + +define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. 
+; CHECK: facgt_from_intr_v2i64: + %val = call <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double> %A, <2 x double> %B) +; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret <2 x i64> %val +} + diff --git a/test/CodeGen/AArch64/neon-fma.ll b/test/CodeGen/AArch64/neon-fma.ll new file mode 100644 index 0000000000000..dcf4e2878068a --- /dev/null +++ b/test/CodeGen/AArch64/neon-fma.ll @@ -0,0 +1,112 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fadd <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <4 x float> @fmla4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = fmul <4 x float> %A, %B; + %tmp2 = fadd <4 x float> %C, %tmp1; + ret <4 x float> %tmp2 +} + +define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp1 = fmul <2 x double> %A, %B; + %tmp2 = fadd <2 x double> %C, %tmp1; + ret <2 x double> %tmp2 +} + + +define <2 x float> @fmls2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fsub <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <4 x float> @fmls4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = fmul <4 x float> %A, %B; + %tmp2 = fsub <4 x float> %C, %tmp1; + ret <4 x float> %tmp2 +} + +define <2 x double> @fmls2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp1 = fmul <2 x double> %A, %B; + %tmp2 = fsub <2 x double> %C, %tmp1; + ret <2 x double> %tmp2 +} + + +; Another set of tests for when the intrinsic is used. 
+ +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define <2 x float> @fmla2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C) + ret <2 x float> %val +} + +define <4 x float> @fmla4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C) + ret <4 x float> %val +} + +define <2 x double> @fmla2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C) + ret <2 x double> %val +} + +define <2 x float> @fmls2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %negA = fsub <2 x float> <float -0.0, float -0.0>, %A + %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %negA, <2 x float> %B, <2 x float> %C) + ret <2 x float> %val +} + +define <4 x float> @fmls4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %negA = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %A + %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %negA, <4 x float> %B, <4 x float> %C) + ret <4 x float> %val +} + +define <2 x double> @fmls2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %negA = fsub <2 x double> <double -0.0, double -0.0>, %A + %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %negA, <2 x double> %B, <2 x double> %C) + ret <2 x double> %val +} + +declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define <2 x float> @fmuladd2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %val = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C) + ret <2 x float> %val +} + +define <4 x float> @fmuladd4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %val = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C) + ret <4 x float> %val +} + +define <2 x double> @fmuladd2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %val = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C) + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll new file mode 100644 index 0000000000000..46fe25d74d9d6 --- /dev/null +++ b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll @@ -0,0 +1,54 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +; Set of tests for when the intrinsic is used. 
+ +declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 is possible, but would be odd. +; CHECK: frsqrts v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 is possible, but would be odd. +; CHECK: frsqrts v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 is possible, but would be odd. +; CHECK: frsqrts v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 is possible, but would be odd. +; CHECK: frecps v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 is possible, but would be odd. +; CHECK: frecps v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 is possible, but would be odd. 
+; CHECK: frecps v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-halving-add-sub.ll b/test/CodeGen/AArch64/neon-halving-add-sub.ll new file mode 100644 index 0000000000000..a8f59dbdb0adb --- /dev/null +++ b/test/CodeGen/AArch64/neon-halving-add-sub.ll @@ -0,0 +1,207 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_shadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: shadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_shadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: shadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_shadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: shadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_shadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: shadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uhadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_shadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> 
%rhs) +; CHECK: shadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uhadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uhadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_shadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: shadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + +declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uhsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uhsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_shsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: shsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uhsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uhsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_shsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: shsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uhsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uhsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_shsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: shsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uhsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uhsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_shsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: shsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uhsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uhsub v0.2s, 
v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_shsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: shsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uhsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uhsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_shsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: shsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll new file mode 100644 index 0000000000000..d757aca86a69b --- /dev/null +++ b/test/CodeGen/AArch64/neon-max-min-pairwise.ll @@ -0,0 +1,310 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_smaxp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smaxp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umaxp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smaxp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smaxp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umaxp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umaxp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smaxp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smaxp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umaxp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umaxp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smaxp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smaxp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + 
+define <8 x i16> @test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umaxp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umaxp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smaxp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smaxp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umaxp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umaxp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smaxp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smaxp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umaxp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umaxp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; CHECK: test_sminp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sminp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uminp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sminp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sminp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uminp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uminp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sminp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sminp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uminp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uminp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sminp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sminp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uminp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uminp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sminp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sminp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uminp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uminp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sminp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sminp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uminp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uminp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + 
+declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxnmp_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxnmp_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxnmp_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminnmp_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminnmp v0.2s, 
v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminnmp_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminnmp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminnmp_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminnmp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-max-min.ll b/test/CodeGen/AArch64/neon-max-min.ll new file mode 100644 index 0000000000000..7889c77e37f1f --- /dev/null +++ b/test/CodeGen/AArch64/neon-max-min.ll @@ -0,0 +1,310 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_smax_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smax v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umax v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smax_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smax v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umax_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umax v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smax_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smax v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umax_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umax v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smax_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smax v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umax_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umax v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> 
@test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smax_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smax v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umax_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umax v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smax_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smax v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umax_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umax v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_smin_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smin v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umin v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smin_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smin v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umin_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umin v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smin_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smin v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umin_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umin v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smin_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smin v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umin_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umin 
v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smin_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smin v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umin_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umin v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smin_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smin v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umin_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umin v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmax_v2f32: + %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmax v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmax_v4f32: + %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmax v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmax_v2f64: + %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmax v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmin_v2f32: + %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmin v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmin_v4f32: + %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmin v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmin_v2f64: + %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmin v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + + +declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> 
@test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxnm_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxnm v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxnm_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxnm v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxnm_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxnm v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminnm_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminnm v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminnm_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminnm v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminnm_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminnm v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-misc-scalar.ll b/test/CodeGen/AArch64/neon-misc-scalar.ll new file mode 100644 index 0000000000000..cca8deb45cbae --- /dev/null +++ b/test/CodeGen/AArch64/neon-misc-scalar.ll @@ -0,0 +1,60 @@ +;RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>) + +declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>) + +declare <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64>) + +declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) + +declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) { +entry: + ; CHECK: test_vuqadd_s64 + %vuqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b) + ; CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}} + ret <1 x i64> %vuqadd2.i +} + +define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) { +entry: + ; CHECK: test_vsqadd_u64 + %vsqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b) + ; CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}} + ret <1 x i64> %vsqadd2.i +} + +define <1 x i64> @test_vabs_s64(<1 x i64> %a) { + ; CHECK: test_vabs_s64 +entry: + %vabs1.i = tail call <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64> %a) + ; CHECK: abs d{{[0-9]+}}, d{{[0-9]+}} + ret <1 x i64> %vabs1.i +} + +define <1 x i64> @test_vqabs_s64(<1 x i64> %a) { + ; CHECK: test_vqabs_s64 +entry: + %vqabs1.i = tail call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %a) + ; CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}} + ret <1 x i64> %vqabs1.i +} + +define <1 x i64> @test_vqneg_s64(<1 x i64> %a) { + ; CHECK: test_vqneg_s64 
+entry: + %vqneg1.i = tail call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %a) + ; CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}} + ret <1 x i64> %vqneg1.i +} + +define <1 x i64> @test_vneg_s64(<1 x i64> %a) { + ; CHECK: test_vneg_s64 +entry: + %sub.i = sub <1 x i64> zeroinitializer, %a + ; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}} + ret <1 x i64> %sub.i +} + diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll new file mode 100644 index 0000000000000..9660bf2c7a304 --- /dev/null +++ b/test/CodeGen/AArch64/neon-misc.ll @@ -0,0 +1,1799 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + + +define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 { +; CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 { +; CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> + ret <16 x i8> %shuffle.i +} + +define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 { +; CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 { +; CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + ret <4 x i16> %shuffle.i +} + +define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 { +; CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> + ret <16 x i8> %shuffle.i +} + +define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 { +; CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x i16> %shuffle.i +} + +define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 0> + ret <2 x i32> %shuffle.i +} + +define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %shuffle.i = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 0> + ret <2 x float> %shuffle.i +} + +define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, 
i32 8> + ret <16 x i8> %shuffle.i +} + +define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + ret <4 x i32> %shuffle.i +} + +define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 { +; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + ret <4 x float> %shuffle.i +} + +define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 { +; CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b + %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4 + ret <4 x i16> %vpaddl.i +} + +define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 { +; CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h + %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4 + ret <2 x i32> %vpaddl1.i +} + +define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 { +; CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s + %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4 + ret <1 x i64> %vpaddl1.i +} + +define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 { +; CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b + %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4 + ret <4 x i16> %vpaddl.i +} + +define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 { +; CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h + %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4 + ret <2 x i32> %vpaddl1.i +} + +define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 { +; CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s + %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4 + ret <1 x i64> %vpaddl1.i +} + +define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 { +; CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b + %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4 + ret <8 x i16> %vpaddl.i +} + +define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 { +; CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h + %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4 + ret <4 x i32> %vpaddl1.i +} + +define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 { +; CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s + %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4 + ret <2 x i64> %vpaddl1.i +} + +define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 { +; CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b + %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4 + ret <8 x i16> %vpaddl.i +} + +define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 { +; CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h + %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4 + ret <4 x i32> %vpaddl1.i +} + +define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 { +; CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s + %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4 + ret <2 x i64> %vpaddl1.i +} + +define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 { +; CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b + %vpadal1.i = tail 
call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4 + ret <4 x i16> %vpadal1.i +} + +define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 { +; CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h + %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4 + ret <2 x i32> %vpadal2.i +} + +define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 { +; CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s + %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4 + ret <1 x i64> %vpadal2.i +} + +define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 { +; CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b + %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4 + ret <4 x i16> %vpadal1.i +} + +define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 { +; CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h + %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4 + ret <2 x i32> %vpadal2.i +} + +define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 { +; CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s + %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4 + ret <1 x i64> %vpadal2.i +} + +define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 { +; CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b + %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4 + ret <8 x i16> %vpadal1.i +} + +define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 { +; CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h + %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4 + ret <4 x i32> %vpadal2.i +} + +define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 { +; CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s + %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4 + ret <2 x i64> %vpadal2.i +} + +define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 { +; CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b + %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4 + ret <8 x i16> %vpadal1.i +} + +define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 { +; CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h + %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4 + ret <4 x i32> %vpadal2.i +} + +define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 { +; CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s + %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4 + ret <2 x i64> %vpadal2.i +} + +define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 { +; CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vqabs.i = tail call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4 + ret <8 x i8> %vqabs.i +} + +define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 { +; CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vqabs.i = tail call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4 + ret <16 x i8> %vqabs.i +} + +define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 { +; CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %vqabs1.i = tail call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4 + ret <4 x i16> %vqabs1.i +} + +define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 { +; CHECK: sqabs 
v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %vqabs1.i = tail call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4 + ret <8 x i16> %vqabs1.i +} + +define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 { +; CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vqabs1.i = tail call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4 + ret <2 x i32> %vqabs1.i +} + +define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 { +; CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vqabs1.i = tail call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4 + ret <4 x i32> %vqabs1.i +} + +define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 { +; CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vqabs1.i = tail call <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64> %a) #4 + ret <2 x i64> %vqabs1.i +} + +define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 { +; CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vqneg.i = tail call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4 + ret <8 x i8> %vqneg.i +} + +define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 { +; CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vqneg.i = tail call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4 + ret <16 x i8> %vqneg.i +} + +define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 { +; CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %vqneg1.i = tail call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4 + ret <4 x i16> %vqneg1.i +} + +define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 { +; CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %vqneg1.i = tail call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4 + ret <8 x i16> %vqneg1.i +} + +define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 { +; CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vqneg1.i = tail call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4 + ret <2 x i32> %vqneg1.i +} + +define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 { +; CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vqneg1.i = tail call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4 + ret <4 x i32> %vqneg1.i +} + +define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 { +; CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vqneg1.i = tail call <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64> %a) #4 + ret <2 x i64> %vqneg1.i +} + +define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 { +; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %sub.i = sub <8 x i8> zeroinitializer, %a + ret <8 x i8> %sub.i +} + +define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 { +; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %sub.i = sub <16 x i8> zeroinitializer, %a + ret <16 x i8> %sub.i +} + +define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 { +; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %sub.i = sub <4 x i16> zeroinitializer, %a + ret <4 x i16> %sub.i +} + +define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 { +; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %sub.i = sub <8 x i16> zeroinitializer, %a + ret <8 x i16> %sub.i +} + +define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 { +; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %sub.i = sub <2 x i32> zeroinitializer, %a + ret <2 x i32> %sub.i +} + +define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 { +; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %sub.i = sub <4 x i32> zeroinitializer, %a + ret <4 x i32> %sub.i +} + +define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 { +; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %sub.i = sub <2 x i64> zeroinitializer, %a + ret <2 x i64> %sub.i +} + +define <2 x float> @test_vneg_f32(<2 x float> %a) #0 { +; CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %sub.i = fsub <2 x float> <float 
-0.000000e+00, float -0.000000e+00>, %a + ret <2 x float> %sub.i +} + +define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 { +; CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a + ret <4 x float> %sub.i +} + +define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 { +; CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a + ret <2 x double> %sub.i +} + +define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 { +; CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vabs.i = tail call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4 + ret <8 x i8> %vabs.i +} + +define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 { +; CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vabs.i = tail call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4 + ret <16 x i8> %vabs.i +} + +define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 { +; CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %vabs1.i = tail call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4 + ret <4 x i16> %vabs1.i +} + +define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 { +; CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %vabs1.i = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4 + ret <8 x i16> %vabs1.i +} + +define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 { +; CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vabs1.i = tail call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4 + ret <2 x i32> %vabs1.i +} + +define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 { +; CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vabs1.i = tail call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4 + ret <4 x i32> %vabs1.i +} + +define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 { +; CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vabs1.i = tail call <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64> %a) #4 + ret <2 x i64> %vabs1.i +} + +define <2 x float> @test_vabs_f32(<2 x float> %a) #1 { +; CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vabs1.i = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4 + ret <2 x float> %vabs1.i +} + +define <4 x float> @test_vabsq_f32(<4 x float> %a) #1 { +; CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vabs1.i = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4 + ret <4 x float> %vabs1.i +} + +define <2 x double> @test_vabsq_f64(<2 x double> %a) #1 { +; CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vabs1.i = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) #4 + ret <2 x double> %vabs1.i +} + +define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 { +; CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vuqadd.i = tail call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4 + ret <8 x i8> %vuqadd.i +} + +define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 { +; CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vuqadd.i = tail call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4 + ret <16 x i8> %vuqadd.i +} + +define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 { +; CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %vuqadd2.i = tail call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4 + ret <4 x i16> %vuqadd2.i +} + +define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 { +; CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %vuqadd2.i = tail call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4 + ret <8 x i16> %vuqadd2.i +} + +define 
<2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 { +; CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vuqadd2.i = tail call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4 + ret <2 x i32> %vuqadd2.i +} + +define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 { +; CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vuqadd2.i = tail call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4 + ret <4 x i32> %vuqadd2.i +} + +define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 { +; CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vuqadd2.i = tail call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4 + ret <2 x i64> %vuqadd2.i +} + +define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 { +; CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vcls.i = tail call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4 + ret <8 x i8> %vcls.i +} + +define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 { +; CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vcls.i = tail call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4 + ret <16 x i8> %vcls.i +} + +define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 { +; CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %vcls1.i = tail call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4 + ret <4 x i16> %vcls1.i +} + +define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 { +; CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %vcls1.i = tail call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4 + ret <8 x i16> %vcls1.i +} + +define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 { +; CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcls1.i = tail call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4 + ret <2 x i32> %vcls1.i +} + +define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 { +; CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcls1.i = tail call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4 + ret <4 x i32> %vcls1.i +} + +define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 { +; CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4 + ret <8 x i8> %vclz.i +} + +define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 { +; CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4 + ret <16 x i8> %vclz.i +} + +define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 { +; CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h + %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4 + ret <4 x i16> %vclz1.i +} + +define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 { +; CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h + %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4 + ret <8 x i16> %vclz1.i +} + +define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 { +; CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4 + ret <2 x i32> %vclz1.i +} + +define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 { +; CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4 + ret <4 x i32> %vclz1.i +} + +define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 { +; CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vctpop.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4 + ret <8 x i8> %vctpop.i +} + +define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 { +; CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vctpop.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4 + ret <16 x i8> %vctpop.i +} + 
+define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 { +; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + ret <8 x i8> %neg.i +} + +define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 { +; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + ret <16 x i8> %neg.i +} + +define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 { +; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %neg.i = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1> + ret <4 x i16> %neg.i +} + +define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 { +; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %neg.i = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + ret <8 x i16> %neg.i +} + +define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 { +; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %neg.i = xor <2 x i32> %a, <i32 -1, i32 -1> + ret <2 x i32> %neg.i +} + +define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 { +; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %neg.i = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> + ret <4 x i32> %neg.i +} + +define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 { +; CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %vrbit.i = tail call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #4 + ret <8 x i8> %vrbit.i +} + +define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 { +; CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b + %vrbit.i = tail call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #4 + ret <16 x i8> %vrbit.i +} + +define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 { +; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h + %vmovn.i = trunc <8 x i16> %a to <8 x i8> + ret <8 x i8> %vmovn.i +} + +define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 { +; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s + %vmovn.i = trunc <4 x i32> %a to <4 x i16> + ret <4 x i16> %vmovn.i +} + +define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 { +; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vmovn.i = trunc <2 x i64> %a to <2 x i32> + ret <2 x i32> %vmovn.i +} + +define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { +; CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h + %vmovn.i.i = trunc <8 x i16> %b to <8 x i8> + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vmovn.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle.i +} + +define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +; CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s + %vmovn.i.i = trunc <4 x i32> %b to <4 x i16> + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vmovn.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +; CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d + %vmovn.i.i = trunc <2 x i64> %b to <2 x i32> + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vmovn.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 { +; CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h + %vqdmull1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4 + ret <8 x i8> %vqdmull1.i +} + +define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 { +; CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s + %vqdmull1.i = tail call <4 x i16> 
@llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4 + ret <4 x i16> %vqdmull1.i +} + +define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 { +; CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vqdmull1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4 + ret <2 x i32> %vqdmull1.i +} + +define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { +; CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h + %vqdmull1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %b) #4 + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqdmull1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle.i +} + +define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +; CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s + %vqdmull1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %b) #4 + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqdmull1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +; CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d + %vqdmull1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %b) #4 + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqdmull1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 { +; CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h + %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4 + ret <8 x i8> %vqmovn1.i +} + +define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 { +; CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s + %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4 + ret <4 x i16> %vqmovn1.i +} + +define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 { +; CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4 + ret <2 x i32> %vqmovn1.i +} + +define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 { +; CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h + %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %b) #4 + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle.i +} + +define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 { +; CHECK: test_vqmovn_high_s32 + %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %b) #4 + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 { +; CHECK: test_vqmovn_high_s64 + %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %b) #4 + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %shuffle.i +} + +define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 { +; CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h + %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4 + ret <8 x i8> %vqmovn1.i +} + +define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 { +; CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s + %vqmovn1.i = tail call <4 x i16> 
@llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4 + ret <4 x i16> %vqmovn1.i +} + +define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 { +; CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4 + ret <2 x i32> %vqmovn1.i +} + +define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 { +; CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h + %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %b) #4 + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle.i +} + +define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 { +; CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s + %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %b) #4 + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 { +; CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d + %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %b) #4 + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i32> %shuffle.i +} + +define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 { +; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 + %1 = sext <8 x i8> %a to <8 x i16> + %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + ret <8 x i16> %vshll_n +} + +define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 { +; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 + %1 = sext <4 x i16> %a to <4 x i32> + %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16> + ret <4 x i32> %vshll_n +} + +define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 { +; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 + %1 = sext <2 x i32> %a to <2 x i64> + %vshll_n = shl <2 x i64> %1, <i64 32, i64 32> + ret <2 x i64> %vshll_n +} + +define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 { +; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8 + %1 = zext <8 x i8> %a to <8 x i16> + %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + ret <8 x i16> %vshll_n +} + +define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 { +; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16 + %1 = zext <4 x i16> %a to <4 x i32> + %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16> + ret <4 x i32> %vshll_n +} + +define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 { +; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32 + %1 = zext <2 x i32> %a to <2 x i64> + %vshll_n = shl <2 x i64> %1, <i64 32, i64 32> + ret <2 x i64> %vshll_n +} + +define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 { +; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %1 = sext <8 x i8> %shuffle.i to <8 x i16> + %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + ret <8 x i16> %vshll_n +} + +define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 { +; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %1 = sext <4 x i16> %shuffle.i to <4 x i32> + %vshll_n = shl <4 x 
i32> %1, <i32 16, i32 16, i32 16, i32 16> + ret <4 x i32> %vshll_n +} + +define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 { +; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %1 = sext <2 x i32> %shuffle.i to <2 x i64> + %vshll_n = shl <2 x i64> %1, <i64 32, i64 32> + ret <2 x i64> %vshll_n +} + +define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 { +; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8 + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %1 = zext <8 x i8> %shuffle.i to <8 x i16> + %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + ret <8 x i16> %vshll_n +} + +define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 { +; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16 + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %1 = zext <4 x i16> %shuffle.i to <4 x i32> + %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16> + ret <4 x i32> %vshll_n +} + +define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 { +; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32 + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %1 = zext <2 x i32> %shuffle.i to <2 x i64> + %vshll_n = shl <2 x i64> %1, <i64 32, i64 32> + ret <2 x i64> %vshll_n +} + +define <4 x i16> @test_vcvt_f16_f32(<4 x float> %a) #0 { +; CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s + %vcvt1.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4 + ret <4 x i16> %vcvt1.i +} + +define <8 x i16> @test_vcvt_high_f16_f32(<4 x i16> %a, <4 x float> %b) #0 { +; CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s + %vcvt1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %b) #4 + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vcvt1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle.i +} + +define <4 x float> @test_vcvt_f32_f16(<4 x i16> %a) #0 { +; CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h + %vcvt1.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %a) #4 + ret <4 x float> %vcvt1.i +} + +define <4 x float> @test_vcvt_high_f32_f16(<8 x i16> %a) #0 { +; CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vcvt1.i.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %shuffle.i.i) #4 + ret <4 x float> %vcvt1.i.i +} + +define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 { +; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vcvt.i = fptrunc <2 x double> %a to <2 x float> + ret <2 x float> %vcvt.i +} + +define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { +; CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d + %vcvt.i.i = fptrunc <2 x double> %b to <2 x float> + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvt.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %shuffle.i +} + +define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 { +; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d + %vcvtx_f32_f641.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) #4 + ret <2 x float> %vcvtx_f32_f641.i +} + +define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 { +; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d + %vcvtx_f32_f641.i.i = tail call <2 x float> 
@llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) #4 + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %shuffle.i +} + +define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 { +; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s + %vcvt.i = fpext <2 x float> %a to <2 x double> + ret <2 x double> %vcvt.i +} + +define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 { +; CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s + %shuffle.i.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3> + %vcvt.i.i = fpext <2 x float> %shuffle.i.i to <2 x double> + ret <2 x double> %vcvt.i.i +} + +define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 { +; CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrndn1.i = tail call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrndn1.i +} + +define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 { +; CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrndn1.i = tail call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrndn1.i +} + +define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 { +; CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrndn1.i = tail call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrndn1.i +} + +define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 { +; CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrnda1.i = tail call <2 x float> @llvm.round.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrnda1.i +} + +define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 { +; CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrnda1.i = tail call <4 x float> @llvm.round.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrnda1.i +} + +define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 { +; CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrnda1.i = tail call <2 x double> @llvm.round.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrnda1.i +} + +define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 { +; CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrndp1.i = tail call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrndp1.i +} + +define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 { +; CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrndp1.i = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrndp1.i +} + +define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 { +; CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrndp1.i = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrndp1.i +} + +define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 { +; CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrndm1.i = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrndm1.i +} + +define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 { +; CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrndm1.i = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrndm1.i +} + +define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 { +; CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrndm1.i = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrndm1.i +} + +define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 { +; CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrndx1.i = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrndx1.i +} + +define <4 x float> @test_vrndxq_f32(<4 x 
float> %a) #0 { +; CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrndx1.i = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrndx1.i +} + +define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 { +; CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrndx1.i = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrndx1.i +} + +define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 { +; CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrnd1.i = tail call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrnd1.i +} + +define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 { +; CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrnd1.i = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrnd1.i +} + +define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 { +; CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrnd1.i = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrnd1.i +} + +define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 { +; CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrndi1.i = tail call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrndi1.i +} + +define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 { +; CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrndi1.i = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrndi1.i +} + +define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 { +; CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrndi1.i = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrndi1.i +} + +define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 { +; CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvt.i = fptosi <2 x float> %a to <2 x i32> + ret <2 x i32> %vcvt.i +} + +define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 { +; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvt.i = fptosi <4 x float> %a to <4 x i32> + ret <4 x i32> %vcvt.i +} + +define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 { +; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = fptosi <2 x double> %a to <2 x i64> + ret <2 x i64> %vcvt.i +} + +define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 { +; CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvt.i = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %vcvt.i +} + +define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 { +; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvt.i = fptoui <4 x float> %a to <4 x i32> + ret <4 x i32> %vcvt.i +} + +define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 { +; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = fptoui <2 x double> %a to <2 x i64> + ret <2 x i64> %vcvt.i +} + +define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 { +; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtns_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtns_f321.i +} + +define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 { +; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtns_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtns_f321.i +} + +define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 { +; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtns_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtns_f641.i +} + +define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 { +; 
CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtnu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtnu_f321.i +} + +define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 { +; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtnu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtnu_f321.i +} + +define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 { +; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtnu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtnu_f641.i +} + +define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 { +; CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtps_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtps_f321.i +} + +define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 { +; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtps_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtps_f321.i +} + +define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 { +; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtps_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtps_f641.i +} + +define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 { +; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtpu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtpu_f321.i +} + +define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 { +; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtpu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtpu_f321.i +} + +define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 { +; CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtpu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtpu_f641.i +} + +define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 { +; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtms_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtms_f321.i +} + +define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 { +; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtms_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtms_f321.i +} + +define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 { +; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtms_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtms_f641.i +} + +define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 { +; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtmu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtmu_f321.i +} + +define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 { +; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtmu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtmu_f321.i +} + +define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 { +; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtmu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtmu_f641.i +} + +define <2 
x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 { +; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtas_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtas_f321.i +} + +define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 { +; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtas_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtas_f321.i +} + +define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 { +; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtas_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtas_f641.i +} + +define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 { +; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvtau_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) #4 + ret <2 x i32> %vcvtau_f321.i +} + +define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 { +; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvtau_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) #4 + ret <4 x i32> %vcvtau_f321.i +} + +define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 { +; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvtau_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) #4 + ret <2 x i64> %vcvtau_f641.i +} + +define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 { +; CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrsqrte1.i = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrsqrte1.i +} + +define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 { +; CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrsqrte1.i = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrsqrte1.i +} + +define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 { +; CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrsqrte1.i = tail call <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrsqrte1.i +} + +define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 { +; CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrecpe1.i = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4 + ret <2 x float> %vrecpe1.i +} + +define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 { +; CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrecpe1.i = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4 + ret <4 x float> %vrecpe1.i +} + +define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 { +; CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vrecpe1.i = tail call <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double> %a) #4 + ret <2 x double> %vrecpe1.i +} + +define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 { +; CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vrecpe1.i = tail call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4 + ret <2 x i32> %vrecpe1.i +} + +define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 { +; CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vrecpe1.i = tail call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4 + ret <4 x i32> %vrecpe1.i +} + +define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 { +; CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vsqrt1.i = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #4 + ret <2 x float> %vsqrt1.i +} + +define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 { +; CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + 
%vsqrt1.i = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #4 + ret <4 x float> %vsqrt1.i +} + +define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 { +; CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vsqrt1.i = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #4 + ret <2 x double> %vsqrt1.i +} + +define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 { +; CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvt.i = sitofp <2 x i32> %a to <2 x float> + ret <2 x float> %vcvt.i +} + +define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 { +; CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s + %vcvt.i = uitofp <2 x i32> %a to <2 x float> + ret <2 x float> %vcvt.i +} + +define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 { +; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvt.i = sitofp <4 x i32> %a to <4 x float> + ret <4 x float> %vcvt.i +} + +define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 { +; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s + %vcvt.i = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %vcvt.i +} + +define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 { +; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = sitofp <2 x i64> %a to <2 x double> + ret <2 x double> %vcvt.i +} + +define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 { +; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d + %vcvt.i = uitofp <2 x i64> %a to <2 x double> + ret <2 x double> %vcvt.i +} + +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2 + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2 + +declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #2 + +declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) #2 + +declare <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double>) #2 + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) #2 + +declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) #2 + +declare <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double>) #2 + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2 + +declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x 
double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) #2 + +declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) #2 + +declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) #2 + +declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) #2 + +declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3 + +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #3 + +declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #3 + +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #3 + +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3 + +declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #3 + +declare <2 x double> @llvm.rint.v2f64(<2 x double>) #3 + +declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3 + +declare <2 x float> @llvm.rint.v2f32(<2 x float>) #3 + +declare <2 x double> @llvm.floor.v2f64(<2 x double>) #3 + +declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3 + +declare <2 x float> @llvm.floor.v2f32(<2 x float>) #3 + +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #3 + +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3 + +declare <2 x float> @llvm.ceil.v2f32(<2 x float>) #3 + +declare <2 x double> @llvm.round.v2f64(<2 x double>) #3 + +declare <4 x float> @llvm.round.v4f32(<4 x float>) #3 + +declare <2 x float> @llvm.round.v2f32(<2 x float>) #3 + +declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) #2 + +declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2 + +declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2 + +declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) #2 + +declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2 + +declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) #2 + +declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) #2 + +declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) #2 + +declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) #2 + +declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) #2 + +declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) #2 + +declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) #2 + +declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) #2 + +declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) #2 + +declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #2 + +declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) #2 + +declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #2 + +declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) #2 + +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #2 + +declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) #2 + +declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) #2 + +declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) #2 + +declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) #2 + +declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) #2 + +declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) #2 + +declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) #2 + +declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) #2 + +declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) #2 + +declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) #2 + +declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) #2 + +declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) #2 + +declare <8 x 
i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) #2 + +declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) #2 + +declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) #2 + +declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) #2 + +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #3 + +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #3 + +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #3 + +declare <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64>) #2 + +declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) #2 + +declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) #2 + +declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) #2 + +declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) #2 + +declare <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64>) #2 + +declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) #2 + +declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) #2 + +declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) #2 + +declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) #2 + +declare <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64>) #2 + +declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) #2 + +declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) #2 + +declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) #2 + +declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) #2 + +declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) #2 + +declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) #2 + +declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) #2 + +declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) #2 + +declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) #2 + +declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) #2 + +declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) #2 + +declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) #2 + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) #2 + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) #2 + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) #2 + +declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) #2 + +declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) #2 + +declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) #2 + +declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) #2 + +declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) #2 + +declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) #2 + 
+declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) #2 + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) #2 + + +define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvt_s64_f64 +; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}} + %1 = fptosi <1 x double> %a to <1 x i64> + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvt_u64_f64 +; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}} + %1 = fptoui <1 x double> %a to <1 x i64> + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvtn_s64_f64 +; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvtn_u64_f64 +; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvtp_s64_f64 +; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvtp_u64_f64 +; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvtm_s64_f64 +; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvtm_u64_f64 +; CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvta_s64_f64 +; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvta_u64_f64 +; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a) + ret <1 x i64> %1 +} + +define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) { +; CHECK-LABEL: test_vcvt_f64_s64 +; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}} + %1 = sitofp <1 x i64> %a to <1 x double> + ret <1 x double> %1 +} + +define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) { +; CHECK-LABEL: test_vcvt_f64_u64 +; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}} + %1 = uitofp <1 x i64> %a to <1 x double> + ret <1 x double> %1 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>) +declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>) + +define <1 x double> @test_vrndn_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrndn_f64 +; CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}} + %1 = 
tail call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrnda_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrnda_f64 +; CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.round.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrndp_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrndp_f64 +; CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.ceil.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrndm_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrndm_f64 +; CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.floor.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrndx_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrndx_f64 +; CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.rint.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrnd_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrnd_f64 +; CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.trunc.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrndi_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrndi_f64 +; CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>) +declare <1 x double> @llvm.trunc.v1f64(<1 x double>) +declare <1 x double> @llvm.rint.v1f64(<1 x double>) +declare <1 x double> @llvm.floor.v1f64(<1 x double>) +declare <1 x double> @llvm.ceil.v1f64(<1 x double>) +declare <1 x double> @llvm.round.v1f64(<1 x double>) +declare <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double>) + +define <1 x double> @test_vrsqrte_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrsqrte_f64 +; CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrecpe_f64(<1 x double> %a) { +; CHECK-LABEL: test_vrecpe_f64 +; CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vsqrt_f64(<1 x double> %a) { +; CHECK-LABEL: test_vsqrt_f64 +; CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a) + ret <1 x double> %1 +} + +define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vrecps_f64 +; CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %1 +} + +define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) { +; CHECK-LABEL: test_vrsqrts_f64 +; CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + %1 = tail call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %a, <1 x double> %b) + ret <1 x double> %1 +} + +declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>) +declare <1 x double> @llvm.sqrt.v1f64(<1 x double>) +declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>) +declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-mla-mls.ll b/test/CodeGen/AArch64/neon-mla-mls.ll new file mode 100644 index 0000000000000..23e9223a8b7b7 --- /dev/null +++ b/test/CodeGen/AArch64/neon-mla-mls.ll @@ -0,0 +1,88 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +;CHECK: mla {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = mul <8 x i8> %A, %B; + %tmp2 = add <8 x i8> %C, %tmp1; + ret <8 x i8> %tmp2 +} + +define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +;CHECK: mla {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = mul <16 x i8> %A, %B; + %tmp2 = add <16 x i8> %C, %tmp1; + ret <16 x i8> %tmp2 +} + +define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +;CHECK: mla {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp1 = mul <4 x i16> %A, %B; + %tmp2 = add <4 x i16> %C, %tmp1; + ret <4 x i16> %tmp2 +} + +define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +;CHECK: mla {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp1 = mul <8 x i16> %A, %B; + %tmp2 = add <8 x i16> %C, %tmp1; + ret <8 x i16> %tmp2 +} + +define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +;CHECK: mla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = mul <2 x i32> %A, %B; + %tmp2 = add <2 x i32> %C, %tmp1; + ret <2 x i32> %tmp2 +} + +define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +;CHECK: mla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = mul <4 x i32> %A, %B; + %tmp2 = add <4 x i32> %C, %tmp1; + ret <4 x i32> %tmp2 +} + +define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +;CHECK: mls {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = mul <8 x i8> %A, %B; + %tmp2 = sub <8 x i8> %C, %tmp1; + ret <8 x i8> %tmp2 +} + +define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +;CHECK: mls {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = mul <16 x i8> %A, %B; + %tmp2 = sub <16 x i8> %C, %tmp1; + ret <16 x i8> %tmp2 +} + +define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +;CHECK: mls {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp1 = mul <4 x i16> %A, %B; + %tmp2 = sub <4 x i16> %C, %tmp1; + ret <4 x i16> %tmp2 +} + +define <8 x i16> @mls8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +;CHECK: mls {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp1 = mul <8 x i16> %A, %B; + %tmp2 = sub <8 x i16> %C, %tmp1; + ret <8 x i16> %tmp2 +} + +define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +;CHECK: mls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = mul <2 x i32> %A, %B; + %tmp2 = sub <2 x i32> %C, %tmp1; + ret <2 x i32> %tmp2 +} + +define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +;CHECK: mls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = mul <4 x i32> %A, %B; + %tmp2 = sub <4 x i32> %C, %tmp1; + ret <4 x i32> %tmp2 +} + + diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll new file mode 100644 index 0000000000000..60b13b8b9a0ec --- /dev/null +++ b/test/CodeGen/AArch64/neon-mov.ll @@ -0,0 +1,217 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @movi8b() { +;CHECK: movi {{v[0-31]+}}.8b, #0x8 + ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <16 x i8> 
@movi16b() { +;CHECK: movi {{v[0-31]+}}.16b, #0x8 + ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <2 x i32> @movi2s_lsl0() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff + ret <2 x i32> < i32 255, i32 255 > +} + +define <2 x i32> @movi2s_lsl8() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #8 + ret <2 x i32> < i32 65280, i32 65280 > +} + +define <2 x i32> @movi2s_lsl16() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #16 + ret <2 x i32> < i32 16711680, i32 16711680 > + +} + +define <2 x i32> @movi2s_lsl24() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #24 + ret <2 x i32> < i32 4278190080, i32 4278190080 > +} + +define <4 x i32> @movi4s_lsl0() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff + ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 > +} + +define <4 x i32> @movi4s_lsl8() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #8 + ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 > +} + +define <4 x i32> @movi4s_lsl16() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #16 + ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 > + +} + +define <4 x i32> @movi4s_lsl24() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #24 + ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 > +} + +define <4 x i16> @movi4h_lsl0() { +;CHECK: movi {{v[0-31]+}}.4h, #0xff + ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 > +} + +define <4 x i16> @movi4h_lsl8() { +;CHECK: movi {{v[0-31]+}}.4h, #0xff, lsl #8 + ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 > +} + +define <8 x i16> @movi8h_lsl0() { +;CHECK: movi {{v[0-31]+}}.8h, #0xff + ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > +} + +define <8 x i16> @movi8h_lsl8() { +;CHECK: movi {{v[0-31]+}}.8h, #0xff, lsl #8 + ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > +} + + +define <2 x i32> @mvni2s_lsl0() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10 + ret <2 x i32> < i32 4294967279, i32 4294967279 > +} + +define <2 x i32> @mvni2s_lsl8() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #8 + ret <2 x i32> < i32 4294963199, i32 4294963199 > +} + +define <2 x i32> @mvni2s_lsl16() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #16 + ret <2 x i32> < i32 4293918719, i32 4293918719 > +} + +define <2 x i32> @mvni2s_lsl24() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #24 + ret <2 x i32> < i32 4026531839, i32 4026531839 > +} + +define <4 x i32> @mvni4s_lsl0() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10 + ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 > +} + +define <4 x i32> @mvni4s_lsl8() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #8 + ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 > +} + +define <4 x i32> @mvni4s_lsl16() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #16 + ret <4 x i32> < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 > + +} + +define <4 x i32> @mvni4s_lsl24() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #24 + ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 > +} + + +define <4 x i16> @mvni4h_lsl0() { +;CHECK: mvni {{v[0-31]+}}.4h, #0x10 + ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 > +} + +define <4 x i16> @mvni4h_lsl8() { +;CHECK: mvni {{v[0-31]+}}.4h, #0x10, lsl #8 + ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 > +} + +define <8 x i16> @mvni8h_lsl0() { +;CHECK: mvni {{v[0-31]+}}.8h, #0x10 + ret <8 x i16> < i16 65519, i16 65519, 
i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 > +} + +define <8 x i16> @mvni8h_lsl8() { +;CHECK: mvni {{v[0-31]+}}.8h, #0x10, lsl #8 + ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 > +} + + +define <2 x i32> @movi2s_msl8(<2 x i32> %a) { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #8 + ret <2 x i32> < i32 65535, i32 65535 > +} + +define <2 x i32> @movi2s_msl16() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #16 + ret <2 x i32> < i32 16777215, i32 16777215 > +} + + +define <4 x i32> @movi4s_msl8() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #8 + ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 > +} + +define <4 x i32> @movi4s_msl16() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #16 + ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 > +} + +define <2 x i32> @mvni2s_msl8() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #8 + ret <2 x i32> < i32 18446744073709547264, i32 18446744073709547264> +} + +define <2 x i32> @mvni2s_msl16() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #16 + ret <2 x i32> < i32 18446744073708437504, i32 18446744073708437504> +} + +define <4 x i32> @mvni4s_msl8() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #8 + ret <4 x i32> < i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264> +} + +define <4 x i32> @mvni4s_msl16() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #16 + ret <4 x i32> < i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504> +} + +define <2 x i64> @movi2d() { +;CHECK: movi {{v[0-31]+}}.2d, #0xff0000ff0000ffff + ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 > +} + +define <1 x i64> @movid() { +;CHECK: movi {{d[0-31]+}}, #0xff0000ff0000ffff + ret <1 x i64> < i64 18374687574888349695 > +} + +define <2 x float> @fmov2s() { +;CHECK: fmov {{v[0-31]+}}.2s, #-12.00000000 + ret <2 x float> < float -1.2e1, float -1.2e1> +} + +define <4 x float> @fmov4s() { +;CHECK: fmov {{v[0-31]+}}.4s, #-12.00000000 + ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1> +} + +define <2 x double> @fmov2d() { +;CHECK: fmov {{v[0-31]+}}.2d, #-12.00000000 + ret <2 x double> < double -1.2e1, double -1.2e1> +} + +define <2 x i32> @movi1d_1() { +; CHECK: movi d0, #0xffffffff0000 + ret <2 x i32> < i32 -65536, i32 65535> +} + + +declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>) +define <2 x i32> @movi1d() { +; CHECK: movi d1, #0xffffffff0000 + %1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>) + ret <2 x i32> %1 +} + diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll new file mode 100644 index 0000000000000..e1be31326638d --- /dev/null +++ b/test/CodeGen/AArch64/neon-mul-div.ll @@ -0,0 +1,181 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: mul {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp3 = mul <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: mul {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp3 = mul <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: mul {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp3 = mul <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @mul8xi16(<8 x 
i16> %A, <8 x i16> %B) { +;CHECK: mul {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp3 = mul <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: mul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = mul <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: mul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = mul <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + + define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fmul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fmul <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fmul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fmul <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fmul {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fmul <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + + + define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fdiv {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fdiv <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fdiv {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fdiv <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fdiv {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fdiv <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) +declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) + +define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: poly_mulv8i8: + %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: pmul v0.8b, v0.8b, v1.8b + ret <8 x i8> %prod +} + +define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: poly_mulv16i8: + %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: pmul v0.16b, v0.16b, v1.16b + ret <16 x i8> %prod +} + +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqdmulh v0.4h, v0.4h, v1.4h + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqdmulh v0.8h, v0.8h, v1.8h + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqdmulh_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqdmulh v0.2s, v0.2s, v1.2s + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqdmulh v0.4s, v0.4s, v1.4s + 
ret <4 x i32> %prod +} + +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqrdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqrdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqrdmulh_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqrdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s + ret <4 x i32> %prod +} + +declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: fmulx v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: fmulx v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. 
+; CHECK: fmulx v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll new file mode 100644 index 0000000000000..fa4d54dc745f6 --- /dev/null +++ b/test/CodeGen/AArch64/neon-perm.ll @@ -0,0 +1,1693 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +%struct.int8x8x2_t = type { [2 x <8 x i8>] } +%struct.int16x4x2_t = type { [2 x <4 x i16>] } +%struct.int32x2x2_t = type { [2 x <2 x i32>] } +%struct.uint8x8x2_t = type { [2 x <8 x i8>] } +%struct.uint16x4x2_t = type { [2 x <4 x i16>] } +%struct.uint32x2x2_t = type { [2 x <2 x i32>] } +%struct.float32x2x2_t = type { [2 x <2 x float>] } +%struct.poly8x8x2_t = type { [2 x <8 x i8>] } +%struct.poly16x4x2_t = type { [2 x <4 x i16>] } +%struct.int8x16x2_t = type { [2 x <16 x i8>] } +%struct.int16x8x2_t = type { [2 x <8 x i16>] } +%struct.int32x4x2_t = type { [2 x <4 x i32>] } +%struct.uint8x16x2_t = type { [2 x <16 x i8>] } +%struct.uint16x8x2_t = type { [2 x <8 x i16>] } +%struct.uint32x4x2_t = type { [2 x <4 x i32>] } +%struct.float32x4x2_t = type { [2 x <4 x float>] } +%struct.poly8x16x2_t = type { [2 x <16 x i8>] } +%struct.poly16x8x2_t = type { [2 x <8 x i16>] } + +define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp1_s8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzp1q_s8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp1_s16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzp1q_s16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vuzp1_s32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vuzp1q_s32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vuzp1q_s64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i64> %shuffle.i +} + +define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: 
test_vuzp1_u8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzp1q_u8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp1_u16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzp1q_u16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vuzp1_u32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vuzp1q_u32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vuzp1q_u64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i64> %shuffle.i +} + +define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vuzp1_f32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> + ret <2 x float> %shuffle.i +} + +define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vuzp1q_f32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + ret <4 x float> %shuffle.i +} + +define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) { +; CHECK: test_vuzp1q_f64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> + ret <2 x double> %shuffle.i +} + +define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp1_p8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzp1q_p8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 
26, i32 28, i32 30> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp1_p16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzp1q_p16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp2_s8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzp2q_s8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp2_s16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzp2q_s16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vuzp2_s32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vuzp2q_s32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vuzp2q_s64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle.i +} + +define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp2_u8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzp2q_u8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + ret <16 x i8> %shuffle.i +} + 
+define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp2_u16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzp2q_u16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vuzp2_u32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vuzp2q_u32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vuzp2q_u64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle.i +} + +define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vuzp2_f32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> + ret <2 x float> %shuffle.i +} + +define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vuzp2q_f32: +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret <4 x float> %shuffle.i +} + +define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) { +; CHECK: test_vuzp2q_f64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3> + ret <2 x double> %shuffle.i +} + +define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp2_p8: +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzp2q_p8: +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp2_p16: +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzp2q_p16: +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, 
{{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip1_s8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzip1q_s8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip1_s16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzip1q_s16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vzip1_s32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vzip1q_s32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vzip1q_s64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i64> %shuffle.i +} + +define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip1_u8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzip1q_u8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip1_u16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzip1q_u16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, 
i32 3, i32 11> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vzip1_u32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vzip1q_u32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vzip1q_u64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i64> %shuffle.i +} + +define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vzip1_f32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> + ret <2 x float> %shuffle.i +} + +define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vzip1q_f32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + ret <4 x float> %shuffle.i +} + +define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) { +; CHECK: test_vzip1q_f64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> + ret <2 x double> %shuffle.i +} + +define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip1_p8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzip1q_p8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip1_p16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzip1q_p16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip2_s8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzip2q_s8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 
x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip2_s16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzip2q_s16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vzip2_s32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vzip2q_s32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vzip2q_s64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle.i +} + +define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip2_u8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzip2q_u8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip2_u16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzip2q_u16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vzip2_u32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vzip2q_u32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vzip2q_u64: +; CHECK: 
ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle.i +} + +define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vzip2_f32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> + ret <2 x float> %shuffle.i +} + +define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vzip2q_f32: +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + ret <4 x float> %shuffle.i +} + +define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) { +; CHECK: test_vzip2q_f64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3> + ret <2 x double> %shuffle.i +} + +define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip2_p8: +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzip2q_p8: +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip2_p16: +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzip2q_p16: +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn1_s8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrn1q_s8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn1_s16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrn1q_s16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, 
i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vtrn1_s32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vtrn1q_s32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vtrn1q_s64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i64> %shuffle.i +} + +define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn1_u8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrn1q_u8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn1_u16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrn1q_u16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vtrn1_u32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vtrn1q_u32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vtrn1q_u64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> + ret <2 x i64> %shuffle.i +} + +define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vtrn1_f32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +entry: + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> + ret <2 x float> %shuffle.i +} + +define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vtrn1q_f32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, 
i32 6> + ret <4 x float> %shuffle.i +} + +define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) { +; CHECK: test_vtrn1q_f64: +; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> + ret <2 x double> %shuffle.i +} + +define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn1_p8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrn1q_p8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn1_p16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrn1q_p16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x i16> %shuffle.i +} + +define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn2_s8: +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrn2q_s8: +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn2_s16: +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrn2q_s16: +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vtrn2_s32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vtrn2q_s32: +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) { +; 
CHECK: test_vtrn2q_s64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle.i +} + +define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn2_u8: +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrn2q_u8: +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn2_u16: +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrn2q_u16: +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i16> %shuffle.i +} + +define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vtrn2_u32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i32> %shuffle.i +} + +define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vtrn2q_u32: +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vtrn2q_u64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle.i +} + +define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vtrn2_f32: +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> + ret <2 x float> %shuffle.i +} + +define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vtrn2q_f32: +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x float> %shuffle.i +} + +define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) { +; CHECK: test_vtrn2q_f64: +; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3> + ret <2 x double> %shuffle.i +} + +define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn2_p8: +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i8> %shuffle.i +} + +define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) { +; 
CHECK: test_vtrn2q_p8: +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> + ret <16 x i8> %shuffle.i +} + +define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn2_p16: +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x i16> %shuffle.i +} + +define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrn2q_p16: +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i16> %shuffle.i +} + +define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp_s8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1 + ret %struct.int8x8x2_t %.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp_s16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vuzp_s32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vuzp1.i, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp_u8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1 + ret %struct.uint8x8x2_t %.fca.0.1.insert +} + 
+define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp_u16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1 + ret %struct.uint16x4x2_t %.fca.0.1.insert +} + +define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vuzp_u32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vuzp1.i, 0, 1 + ret %struct.uint32x2x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vuzp_f32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> + %vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vuzp1.i, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vuzp_p8: +; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1 + ret %struct.poly8x8x2_t %.fca.0.1.insert +} + +define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vuzp_p16: +; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1 + ret %struct.poly16x4x2_t %.fca.0.1.insert +} + +define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzpq_s8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 
12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzpq_s16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vuzpq_s32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vuzp1.i, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzpq_u8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1 + ret %struct.uint8x16x2_t %.fca.0.1.insert +} + +define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzpq_u16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1 + ret %struct.uint16x8x2_t %.fca.0.1.insert +} + +define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vuzpq_u32: +; CHECK: uzp1 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vuzp1.i, 0, 1 + ret %struct.uint32x4x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vuzpq_f32: +; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vuzp.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %vuzp1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vuzp1.i, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vuzpq_p8: +; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1 + ret %struct.poly8x16x2_t %.fca.0.1.insert +} + +define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vuzpq_p16: +; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1 + ret %struct.poly16x8x2_t %.fca.0.1.insert +} + +define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip_s8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1 + ret %struct.int8x8x2_t %.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip_s16: +; CHECK: 
zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vzip_s32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vzip1.i, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip_u8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1 + ret %struct.uint8x8x2_t %.fca.0.1.insert +} + +define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip_u16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1 + ret %struct.uint16x4x2_t %.fca.0.1.insert +} + +define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vzip_u32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vzip1.i, 0, 1 + ret %struct.uint32x2x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vzip_f32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> + %vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vzip.i, 0, 0 + %.fca.0.1.insert = 
insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vzip1.i, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vzip_p8: +; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1 + ret %struct.poly8x8x2_t %.fca.0.1.insert +} + +define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vzip_p16: +; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1 + ret %struct.poly16x4x2_t %.fca.0.1.insert +} + +define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzipq_s8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzipq_s16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vzipq_s32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vzip.i, 0, 0 + %.fca.0.1.insert = 
insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vzip1.i, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzipq_u8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1 + ret %struct.uint8x16x2_t %.fca.0.1.insert +} + +define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzipq_u16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1 + ret %struct.uint16x8x2_t %.fca.0.1.insert +} + +define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vzipq_u32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vzip1.i, 0, 1 + ret %struct.uint32x4x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vzipq_f32: +; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vzip.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %vzip1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vzip1.i, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vzipq_p8: +; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 
27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1 + ret %struct.poly8x16x2_t %.fca.0.1.insert +} + +define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vzipq_p16: +; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vzip.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1 + ret %struct.poly16x8x2_t %.fca.0.1.insert +} + +define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn_s8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1 + ret %struct.int8x8x2_t %.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn_s16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vtrn_s32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vtrn1.i, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn_u8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t 
%.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1 + ret %struct.uint8x8x2_t %.fca.0.1.insert +} + +define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn_u16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1 + ret %struct.uint16x4x2_t %.fca.0.1.insert +} + +define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vtrn_u32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2> + %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vtrn1.i, 0, 1 + ret %struct.uint32x2x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) { +; CHECK: test_vtrn_f32: +; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0] +; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1] +entry: + %vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2> + %vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3> + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vtrn1.i, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtrn_p8: +; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +entry: + %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1 + ret %struct.poly8x8x2_t %.fca.0.1.insert +} + +define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vtrn_p16: +; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +entry: + %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1 + ret %struct.poly16x4x2_t %.fca.0.1.insert +} + +define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrnq_s8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vtrn.i = 
shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> + %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrnq_s16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vtrnq_s32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vtrn1.i, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrnq_u8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> + %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> + %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1 + ret %struct.uint8x16x2_t %.fca.0.1.insert +} + +define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrnq_u16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1 + ret %struct.uint16x8x2_t %.fca.0.1.insert +} + +define %struct.uint32x4x2_t 
@test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vtrnq_u32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vtrn1.i, 0, 1 + ret %struct.uint32x4x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) { +; CHECK: test_vtrnq_f32: +; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +entry: + %vtrn.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + %vtrn1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vtrn1.i, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vtrnq_p8: +; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +entry: + %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> + %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> + %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1 + ret %struct.poly8x16x2_t %.fca.0.1.insert +} + +define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vtrnq_p16: +; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +entry: + %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1 + ret %struct.poly16x8x2_t %.fca.0.1.insert +} + +define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) { +; CHECK: test_uzp: + + %vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1 + ret %struct.uint8x8x2_t %.fca.0.1.insert + +; CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; CHECK-NEXT: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK-NEXT: uzp2 {{v[0-9]+}}.8b, 
{{v[0-9]+}}.8b, {{v[0-9]+}}.8b +} diff --git a/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/test/CodeGen/AArch64/neon-rounding-halving-add.ll new file mode 100644 index 0000000000000..009da3b51a830 --- /dev/null +++ b/test/CodeGen/AArch64/neon-rounding-halving-add.ll @@ -0,0 +1,105 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_urhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: urhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_srhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: srhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_urhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: urhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_srhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: srhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_urhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: urhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_srhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: srhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_urhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: urhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_srhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: srhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_urhadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: urhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_srhadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: srhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 
+}
+
+declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_urhadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: urhadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_srhadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: srhadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll
new file mode 100644
index 0000000000000..5b4ec2862c792
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-rounding-shift.ll
@@ -0,0 +1,121 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_urshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: urshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_srshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: srshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_urshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: urshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_srshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: srshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_urshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: urshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_srshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: srshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_urshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: urshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_srshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: srshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32>
@llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_urshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: urshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_srshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: srshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_urshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: urshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_srshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: srshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_urshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: urshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_srshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: srshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll new file mode 100644 index 0000000000000..fc60d900e4db8 --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-add-sub.ll @@ -0,0 +1,241 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) +declare 
<4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + + +declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqadd_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqadd v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqadd_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqadd v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqsub_v8i8: + %tmp1 
= call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqsub_v2i64: + %tmp1 = call 
<2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqsub v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqsub_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqsub v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll new file mode 100644 index 0000000000000..d89262c2abaaf --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll @@ -0,0 +1,121 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqrshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqrshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqrshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqrshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqrshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqrshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqrshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqrshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqrshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqrshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqrshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqrshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqrshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqrshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqrshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqrshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqrshl_v2i32: 
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqrshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqrshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqrshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqrshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqrshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqrshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqrshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqrshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqrshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqrshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqrshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll new file mode 100644 index 0000000000000..11009fba75112 --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-shift.ll @@ -0,0 +1,121 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqshl_v4i16: + %tmp1 = call <4 x i16> 
@llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-scalar-abs.ll b/test/CodeGen/AArch64/neon-scalar-abs.ll new file mode 100644 index 0000000000000..03a89e043e50a --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-abs.ll @@ -0,0 +1,61 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define i64 @test_vabsd_s64(i64 %a) { +; CHECK: test_vabsd_s64 +; CHECK: abs {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vabs.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vabs1.i = tail call <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64> %vabs.i) + %0 = extractelement <1 x i64> %vabs1.i, i32 0 + 
ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64>) + +define i8 @test_vqabsb_s8(i8 %a) { +; CHECK: test_vqabsb_s8 +; CHECK: sqabs {{b[0-9]+}}, {{b[0-9]+}} +entry: + %vqabs.i = insertelement <1 x i8> undef, i8 %a, i32 0 + %vqabs1.i = call <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8> %vqabs.i) + %0 = extractelement <1 x i8> %vqabs1.i, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8>) + +define i16 @test_vqabsh_s16(i16 %a) { +; CHECK: test_vqabsh_s16 +; CHECK: sqabs {{h[0-9]+}}, {{h[0-9]+}} +entry: + %vqabs.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vqabs1.i = call <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16> %vqabs.i) + %0 = extractelement <1 x i16> %vqabs1.i, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16>) + +define i32 @test_vqabss_s32(i32 %a) { +; CHECK: test_vqabss_s32 +; CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vqabs.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqabs1.i = call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %vqabs.i) + %0 = extractelement <1 x i32> %vqabs1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>) + +define i64 @test_vqabsd_s64(i64 %a) { +; CHECK: test_vqabsd_s64 +; CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vqabs.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vqabs1.i = call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %vqabs.i) + %0 = extractelement <1 x i64> %vqabs1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll new file mode 100644 index 0000000000000..09ca880c80537 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-add-sub.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + %tmp3 = add <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + %tmp3 = sub <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +declare <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_add_v1i64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uadd_v1i64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +declare <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sub_v1i64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_usub_v1i64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + + + diff 
--git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll new file mode 100644 index 0000000000000..8ce42def409a1 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll @@ -0,0 +1,108 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +declare float @llvm.fma.f32(float, float, float) +declare double @llvm.fma.f64(double, double, double) + +define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) { + ; CHECK: test_fmla_ss4S + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %tmp2 +} + +define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) { + ; CHECK: test_fmla_ss4S_swap + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a) + ret float %tmp2 +} + +define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) { + ; CHECK: test_fmla_ss2S + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a) + ret float %tmp2 +} + +define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) { + ; CHECK: test_fmla_ddD + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0] + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a) + ret double %tmp2 +} + +define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) { + ; CHECK: test_fmla_dd2D + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a) + ret double %tmp2 +} + +define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) { + ; CHECK: test_fmla_dd2D_swap + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a) + ret double %tmp2 +} + +define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) { + ; CHECK: test_fmls_ss4S + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = fsub float -0.0, %tmp1 + %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a) + ret float %tmp3 +} + +define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) { + ; CHECK: test_fmls_ss4S_swap + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = fsub float -0.0, %tmp1 + %tmp3 = call float @llvm.fma.f32(float %tmp1, float %tmp2, float %a) + ret float %tmp3 +} + + +define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) { + ; CHECK: test_fmls_ss2S + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + %tmp2 = fsub float -0.0, %tmp1 + %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a) + ret float %tmp3 +} + +define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) { + ; CHECK: test_fmls_ddD + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0] + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = fsub double -0.0, %tmp1 + %tmp3 = call double @llvm.fma.f64(double 
%tmp2, double %tmp1, double %a) + ret double %tmp3 +} + +define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) { + ; CHECK: test_fmls_dd2D + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = fsub double -0.0, %tmp1 + %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a) + ret double %tmp3 +} + +define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) { + ; CHECK: test_fmls_dd2D_swap + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = fsub double -0.0, %tmp1 + %tmp3 = call double @llvm.fma.f64(double %tmp1, double %tmp2, double %a) + ret double %tmp3 +} + diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll new file mode 100644 index 0000000000000..968ad3e8cf71e --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll @@ -0,0 +1,124 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) { + ; CHECK: test_fmul_lane_ss2S + ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + %tmp2 = fmul float %a, %tmp1; + ret float %tmp2; +} + +define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) { + ; CHECK: test_fmul_lane_ss2S_swap + ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + %tmp2 = fmul float %tmp1, %a; + ret float %tmp2; +} + + +define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) { + ; CHECK: test_fmul_lane_ss4S + ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = fmul float %a, %tmp1; + ret float %tmp2; +} + +define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) { + ; CHECK: test_fmul_lane_ss4S_swap + ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = fmul float %tmp1, %a; + ret float %tmp2; +} + + +define double @test_fmul_lane_ddD(double %a, <1 x double> %v) { + ; CHECK: test_fmul_lane_ddD + ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0] + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = fmul double %a, %tmp1; + ret double %tmp2; +} + + + +define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) { + ; CHECK: test_fmul_lane_dd2D + ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = fmul double %a, %tmp1; + ret double %tmp2; +} + + +define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) { + ; CHECK: test_fmul_lane_dd2D_swap + ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = fmul double %tmp1, %a; + ret double %tmp2; +} + +declare float @llvm.aarch64.neon.vmulx.f32(float, float) + +define float @test_fmulx_lane_f32(float %a, <2 x float> %v) { + ; CHECK: test_fmulx_lane_f32 + ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1) + ret float %tmp2; +} + +define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) { + ; CHECK: test_fmulx_laneq_f32 + ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + 
%tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1) + ret float %tmp2; +} + +define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) { + ; CHECK: test_fmulx_laneq_f32_swap + ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a) + ret float %tmp2; +} + +declare double @llvm.aarch64.neon.vmulx.f64(double, double) + +define double @test_fmulx_lane_f64(double %a, <1 x double> %v) { + ; CHECK: test_fmulx_lane_f64 + ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0] + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) + ret double %tmp2; +} + +define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) { + ; CHECK: test_fmulx_laneq_f64_0 + ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0] + %tmp1 = extractelement <2 x double> %v, i32 0 + %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) + ret double %tmp2; +} + + +define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) { + ; CHECK: test_fmulx_laneq_f64_1 + ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1) + ret double %tmp2; +} + +define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) { + ; CHECK: test_fmulx_laneq_f64_1_swap + ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a) + ret double %tmp2; +} + diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll new file mode 100644 index 0000000000000..5f10cbbab2a65 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-compare.ll @@ -0,0 +1,343 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +;; Scalar Integer Compare + +define i64 @test_vceqd(i64 %a, i64 %b) { +; CHECK: test_vceqd +; CHECK: cmeq {{d[0-9]+}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vceq.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vceq1.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceq.i, <1 x i64> %vceq1.i) + %0 = extractelement <1 x i64> %vceq2.i, i32 0 + ret i64 %0 +} + +define i64 @test_vceqzd(i64 %a) { +; CHECK: test_vceqzd +; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 +entry: + %vceqz.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vceqz1.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceqz.i, <1 x i64> zeroinitializer) + %0 = extractelement <1 x i64> %vceqz1.i, i32 0 + ret i64 %0 +} + +define i64 @test_vcged(i64 %a, i64 %b) { +; CHECK: test_vcged +; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcge.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vcge1.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i) + %0 = extractelement <1 x i64> %vcge2.i, i32 0 + ret i64 %0 +} + +define i64 @test_vcgezd(i64 %a) { +; CHECK: test_vcgezd +; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0 +entry: + %vcgez.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vcgez1.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcgez.i, <1 x i64> zeroinitializer) + %0 = 
extractelement <1 x i64> %vcgez1.i, i32 0 + ret i64 %0 +} + +define i64 @test_vcgtd(i64 %a, i64 %b) { +; CHECK: test_vcgtd +; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcgt.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vcgt1.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i) + %0 = extractelement <1 x i64> %vcgt2.i, i32 0 + ret i64 %0 +} + +define i64 @test_vcgtzd(i64 %a) { +; CHECK: test_vcgtzd +; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0 +entry: + %vcgtz.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vcgtz1.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgtz.i, <1 x i64> zeroinitializer) + %0 = extractelement <1 x i64> %vcgtz1.i, i32 0 + ret i64 %0 +} + +define i64 @test_vcled(i64 %a, i64 %b) { +; CHECK: test_vcled +; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcgt.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vcgt1.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i) + %0 = extractelement <1 x i64> %vcgt2.i, i32 0 + ret i64 %0 +} + +define i64 @test_vclezd(i64 %a) { +; CHECK: test_vclezd +; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0 +entry: + %vclez.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vclez1.i = call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64> %vclez.i, <1 x i64> zeroinitializer) + %0 = extractelement <1 x i64> %vclez1.i, i32 0 + ret i64 %0 +} + +define i64 @test_vcltd(i64 %a, i64 %b) { +; CHECK: test_vcltd +; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcge.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vcge1.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i) + %0 = extractelement <1 x i64> %vcge2.i, i32 0 + ret i64 %0 +} + +define i64 @test_vcltzd(i64 %a) { +; CHECK: test_vcltzd +; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0x0 +entry: + %vcltz.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vcltz1.i = call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64> %vcltz.i, <1 x i64> zeroinitializer) + %0 = extractelement <1 x i64> %vcltz1.i, i32 0 + ret i64 %0 +} + +define i64 @test_vtstd(i64 %a, i64 %b) { +; CHECK: test_vtstd +; CHECK: cmtst {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vtst.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vtst1.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vtst2.i = call <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64> %vtst.i, <1 x i64> %vtst1.i) + %0 = extractelement <1 x i64> %vtst2.i, i32 0 + ret i64 %0 +} + + +define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vcage_f64 +; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2 + ret <1 x i64> %vcage2.i +} + +define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vcagt_f64 +; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2 + ret <1 x i64> %vcagt2.i +} + +define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vcale_f64 +; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %vcage2.i = tail call <1 x i64> 
@llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2 + ret <1 x i64> %vcage2.i +} + +define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vcalt_f64 +; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2 + ret <1 x i64> %vcagt2.i +} + +define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vceq_s64 +; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp eq <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vceq_u64 +; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp eq <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vceq_f64 +; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = fcmp oeq <1 x double> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vcge_s64 +; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp sge <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vcge_u64 +; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp uge <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vcge_f64 +; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = fcmp oge <1 x double> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vcle_s64 +; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp sle <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vcle_u64 +; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp ule <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vcle_f64 +; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = fcmp ole <1 x double> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vcgt_s64 +; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp sgt <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vcgt_u64 +; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp ugt <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vcgt_f64 +; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = fcmp ogt <1 x double> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: 
test_vclt_s64 +; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp slt <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 { +; CHECK: test_vclt_u64 +; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = icmp ult <1 x i64> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 { +; CHECK: test_vclt_f64 +; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} + %cmp.i = fcmp olt <1 x double> %a, %b + %sext.i = sext <1 x i1> %cmp.i to <1 x i64> + ret <1 x i64> %sext.i +} + +define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 { +; CHECK: test_vceqz_s64 +; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 + %1 = icmp eq <1 x i64> %a, zeroinitializer + %vceqz.i = zext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vceqz.i +} + +define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 { +; CHECK: test_vceqz_u64 +; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 + %1 = icmp eq <1 x i64> %a, zeroinitializer + %vceqz.i = zext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vceqz.i +} + +define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 { +; CHECK: test_vceqz_p64 +; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0 + %1 = icmp eq <1 x i64> %a, zeroinitializer + %vceqz.i = zext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vceqz.i +} + +define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 { +; CHECK: test_vceqzq_p64 +; CHECK: cmeq {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0 + %1 = icmp eq <2 x i64> %a, zeroinitializer + %vceqz.i = zext <2 x i1> %1 to <2 x i64> + ret <2 x i64> %vceqz.i +} + +define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 { +; CHECK: test_vcgez_s64 +; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0 + %1 = icmp sge <1 x i64> %a, zeroinitializer + %vcgez.i = zext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vcgez.i +} + +define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 { +; CHECK: test_vclez_s64 +; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0 + %1 = icmp sle <1 x i64> %a, zeroinitializer + %vclez.i = zext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vclez.i +} + +define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 { +; CHECK: test_vcgtz_s64 +; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0 + %1 = icmp sgt <1 x i64> %a, zeroinitializer + %vcgtz.i = zext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vcgtz.i +} + +define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 { +; CHECK: test_vcltz_s64 +; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0 + %1 = icmp slt <1 x i64> %a, zeroinitializer + %vcltz.i = zext <1 x i1> %1 to <1 x i64> + ret <1 x i64> %vcltz.i +} + +declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vchi.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll 
b/test/CodeGen/AArch64/neon-scalar-copy.ll new file mode 100644 index 0000000000000..d433ff595d1c6 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-copy.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define float @test_dup_sv2S(<2 x float> %v) { + ;CHECK: test_dup_sv2S + ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + ret float %tmp1 +} + +define float @test_dup_sv4S(<4 x float> %v) { + ;CHECK: test_dup_sv4S + ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[0] + %tmp1 = extractelement <4 x float> %v, i32 0 + ret float %tmp1 +} + +define double @test_dup_dvD(<1 x double> %v) { + ;CHECK: test_dup_dvD + ;CHECK-NOT: dup {{d[0-31]+}}, {{v[0-31]+}}.d[0] + ;CHECK: ret + %tmp1 = extractelement <1 x double> %v, i32 0 + ret double %tmp1 +} + +define double @test_dup_dv2D(<2 x double> %v) { + ;CHECK: test_dup_dv2D + ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + ret double %tmp1 +} + +define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) { + ;CHECK: test_vector_dup_bv16B + ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[14] + %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14> + ret <1 x i8> %shuffle.i +} + +define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) { + ;CHECK: test_vector_dup_bv8B + ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[7] + %shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7> + ret <1 x i8> %shuffle.i +} + +define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) { + ;CHECK: test_vector_dup_hv8H + ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[7] + %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7> + ret <1 x i16> %shuffle.i +} + +define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) { + ;CHECK: test_vector_dup_hv4H + ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[3] + %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3> + ret <1 x i16> %shuffle.i +} + +define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) { + ;CHECK: test_vector_dup_sv4S + ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[3] + %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3> + ret <1 x i32> %shuffle +} + +define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) { + ;CHECK: test_vector_dup_sv2S + ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1] + %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1> + ret <1 x i32> %shuffle +} + +define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) { + ;CHECK: test_vector_dup_dv2D + ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1] + %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1> + ret <1 x i64> %shuffle.i +} + +define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) { + ;CHECK: test_vector_copy_dup_dv2D + ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1] + %vget_lane = extractelement <2 x i64> %c, i32 1 + %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0 + ret <1 x i64> %vset_lane +} + diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll new file mode 100644 index 0000000000000..a06d5d60a85b3 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-cvt.ll @@ -0,0 +1,137 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define float @test_vcvts_f32_s32(i32 %a) { +; CHECK: test_vcvts_f32_s32 +; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %0 = call float 
@llvm.aarch64.neon.vcvtf32.s32(<1 x i32> %vcvtf.i) + ret float %0 +} + +declare float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32>) + +define double @test_vcvtd_f64_s64(i64 %a) { +; CHECK: test_vcvtd_f64_s64 +; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %0 = call double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64> %vcvtf.i) + ret double %0 +} + +declare double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64>) + +define float @test_vcvts_f32_u32(i32 %a) { +; CHECK: test_vcvts_f32_u32 +; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %0 = call float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32> %vcvtf.i) + ret float %0 +} + +declare float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32>) + +define double @test_vcvtd_f64_u64(i64 %a) { +; CHECK: test_vcvtd_f64_u64 +; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %0 = call double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64> %vcvtf.i) + ret double %0 +} + +declare double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64>) + +define float @test_vcvts_n_f32_s32(i32 %a) { +; CHECK: test_vcvts_n_f32_s32 +; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 +entry: + %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0 + %0 = call float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32> %vcvtf, i32 1) + ret float %0 +} + +declare float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32>, i32) + +define double @test_vcvtd_n_f64_s64(i64 %a) { +; CHECK: test_vcvtd_n_f64_s64 +; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 +entry: + %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0 + %0 = call double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64> %vcvtf, i32 1) + ret double %0 +} + +declare double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64>, i32) + +define float @test_vcvts_n_f32_u32(i32 %a) { +; CHECK: test_vcvts_n_f32_u32 +; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1 +entry: + %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0 + %0 = call float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32> %vcvtf, i32 1) + ret float %0 +} + +declare float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32>, i32) + +define double @test_vcvtd_n_f64_u64(i64 %a) { +; CHECK: test_vcvtd_n_f64_u64 +; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1 +entry: + %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0 + %0 = call double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64> %vcvtf, i32 1) + ret double %0 +} + +declare double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64>, i32) + +define i32 @test_vcvts_n_s32_f32(float %a) { +; CHECK: test_vcvts_n_s32_f32 +; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1 +entry: + %fcvtzs = insertelement <1 x float> undef, float %a, i32 0 + %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float> %fcvtzs, i32 1) + %0 = extractelement <1 x i32> %fcvtzs1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float>, i32) + +define i64 @test_vcvtd_n_s64_f64(double %a) { +; CHECK: test_vcvtd_n_s64_f64 +; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1 +entry: + %fcvtzs = insertelement <1 x double> undef, double %a, i32 0 + %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double> %fcvtzs, i32 1) + %0 = extractelement <1 x i64> %fcvtzs1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double>, i32) + +define i32 @test_vcvts_n_u32_f32(float %a) { +; CHECK: test_vcvts_n_u32_f32 +; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32 +entry: + %fcvtzu = insertelement <1 x float> undef, 
float %a, i32 0 + %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float> %fcvtzu, i32 32) + %0 = extractelement <1 x i32> %fcvtzu1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float>, i32) + +define i64 @test_vcvtd_n_u64_f64(double %a) { +; CHECK: test_vcvtd_n_u64_f64 +; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64 +entry: + %fcvtzu = insertelement <1 x double> undef, double %a, i32 0 + %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double> %fcvtzu, i32 64) + %0 = extractelement <1 x i64> %fcvtzu1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double>, i32) diff --git a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll new file mode 100644 index 0000000000000..faf521bc889a7 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll @@ -0,0 +1,104 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define i8 @test_vqmovunh_s16(i16 %a) { +; CHECK: test_vqmovunh_s16 +; CHECK: sqxtun {{b[0-9]+}}, {{h[0-9]+}} +entry: + %vqmovun.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vqmovun1.i = call <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16> %vqmovun.i) + %0 = extractelement <1 x i8> %vqmovun1.i, i32 0 + ret i8 %0 +} + +define i16 @test_vqmovuns_s32(i32 %a) { +; CHECK: test_vqmovuns_s32 +; CHECK: sqxtun {{h[0-9]+}}, {{s[0-9]+}} +entry: + %vqmovun.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqmovun1.i = call <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32> %vqmovun.i) + %0 = extractelement <1 x i16> %vqmovun1.i, i32 0 + ret i16 %0 +} + +define i32 @test_vqmovund_s64(i64 %a) { +; CHECK: test_vqmovund_s64 +; CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}} +entry: + %vqmovun.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vqmovun1.i = call <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64> %vqmovun.i) + %0 = extractelement <1 x i32> %vqmovun1.i, i32 0 + ret i32 %0 +} + +declare <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16>) +declare <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32>) +declare <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64>) + +define i8 @test_vqmovnh_s16(i16 %a) { +; CHECK: test_vqmovnh_s16 +; CHECK: sqxtn {{b[0-9]+}}, {{h[0-9]+}} +entry: + %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16> %vqmovn.i) + %0 = extractelement <1 x i8> %vqmovn1.i, i32 0 + ret i8 %0 +} + +define i16 @test_vqmovns_s32(i32 %a) { +; CHECK: test_vqmovns_s32 +; CHECK: sqxtn {{h[0-9]+}}, {{s[0-9]+}} +entry: + %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32> %vqmovn.i) + %0 = extractelement <1 x i16> %vqmovn1.i, i32 0 + ret i16 %0 +} + +define i32 @test_vqmovnd_s64(i64 %a) { +; CHECK: test_vqmovnd_s64 +; CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}} +entry: + %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64> %vqmovn.i) + %0 = extractelement <1 x i32> %vqmovn1.i, i32 0 + ret i32 %0 +} + +declare <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16>) +declare <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32>) +declare <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64>) + +define i8 @test_vqmovnh_u16(i16 %a) { +; CHECK: test_vqmovnh_u16 +; CHECK: uqxtn {{b[0-9]+}}, {{h[0-9]+}} +entry: + %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vqmovn1.i = call <1 x i8> 
@llvm.arm.neon.vqmovnu.v1i8(<1 x i16> %vqmovn.i) + %0 = extractelement <1 x i8> %vqmovn1.i, i32 0 + ret i8 %0 +} + + +define i16 @test_vqmovns_u32(i32 %a) { +; CHECK: test_vqmovns_u32 +; CHECK: uqxtn {{h[0-9]+}}, {{s[0-9]+}} +entry: + %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32> %vqmovn.i) + %0 = extractelement <1 x i16> %vqmovn1.i, i32 0 + ret i16 %0 +} + +define i32 @test_vqmovnd_u64(i64 %a) { +; CHECK: test_vqmovnd_u64 +; CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}} +entry: + %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64> %vqmovn.i) + %0 = extractelement <1 x i32> %vqmovn1.i, i32 0 + ret i32 %0 +} + +declare <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16>) +declare <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32>) +declare <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll new file mode 100644 index 0000000000000..75686d32064b4 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-fabd.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define float @test_vabds_f32(float %a, float %b) { +; CHECK-LABEL: test_vabds_f32 +; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vabd.i = insertelement <1 x float> undef, float %a, i32 0 + %vabd1.i = insertelement <1 x float> undef, float %b, i32 0 + %vabd2.i = call <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float> %vabd.i, <1 x float> %vabd1.i) + %0 = extractelement <1 x float> %vabd2.i, i32 0 + ret float %0 +} + +define double @test_vabdd_f64(double %a, double %b) { +; CHECK-LABEL: test_vabdd_f64 +; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vabd.i = insertelement <1 x double> undef, double %a, i32 0 + %vabd1.i = insertelement <1 x double> undef, double %b, i32 0 + %vabd2.i = call <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double> %vabd.i, <1 x double> %vabd1.i) + %0 = extractelement <1 x double> %vabd2.i, i32 0 + ret double %0 +} + +declare <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double>, <1 x double>) +declare <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float>, <1 x float>) diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll new file mode 100644 index 0000000000000..d7b84fae7375c --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-fcvt.ll @@ -0,0 +1,255 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +;; Scalar Floating-point Convert + +define float @test_vcvtxn(double %a) { +; CHECK: test_vcvtxn +; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}} +entry: + %vcvtf.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtf1.i = tail call <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double> %vcvtf.i) + %0 = extractelement <1 x float> %vcvtf1.i, i32 0 + ret float %0 +} + +declare <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double>) + +define i32 @test_vcvtass(float %a) { +; CHECK: test_vcvtass +; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtas.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtas1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float> %vcvtas.i) + %0 = extractelement <1 x i32> %vcvtas1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float>) + +define i64 @test_test_vcvtasd(double %a) { +; CHECK: 
test_test_vcvtasd +; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtas.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtas1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %vcvtas.i) + %0 = extractelement <1 x i64> %vcvtas1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtaus(float %a) { +; CHECK: test_vcvtaus +; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtau.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtau1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float> %vcvtau.i) + %0 = extractelement <1 x i32> %vcvtau1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtaud(double %a) { +; CHECK: test_vcvtaud +; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtau.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtau1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %vcvtau.i) + %0 = extractelement <1 x i64> %vcvtau1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtmss(float %a) { +; CHECK: test_vcvtmss +; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtms.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtms1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float> %vcvtms.i) + %0 = extractelement <1 x i32> %vcvtms1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtmd_s64_f64(double %a) { +; CHECK: test_vcvtmd_s64_f64 +; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtms.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtms1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %vcvtms.i) + %0 = extractelement <1 x i64> %vcvtms1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtmus(float %a) { +; CHECK: test_vcvtmus +; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtmu.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtmu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float> %vcvtmu.i) + %0 = extractelement <1 x i32> %vcvtmu1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtmud(double %a) { +; CHECK: test_vcvtmud +; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtmu.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtmu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %vcvtmu.i) + %0 = extractelement <1 x i64> %vcvtmu1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtnss(float %a) { +; CHECK: test_vcvtnss +; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtns.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtns1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float> %vcvtns.i) + %0 = extractelement <1 x i32> %vcvtns1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtnd_s64_f64(double %a) { +; CHECK: test_vcvtnd_s64_f64 +; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtns.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtns1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %vcvtns.i) + %0 = 
extractelement <1 x i64> %vcvtns1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtnus(float %a) { +; CHECK: test_vcvtnus +; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtnu.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtnu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float> %vcvtnu.i) + %0 = extractelement <1 x i32> %vcvtnu1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtnud(double %a) { +; CHECK: test_vcvtnud +; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtnu.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtnu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %vcvtnu.i) + %0 = extractelement <1 x i64> %vcvtnu1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtpss(float %a) { +; CHECK: test_vcvtpss +; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtps.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtps1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float> %vcvtps.i) + %0 = extractelement <1 x i32> %vcvtps1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtpd_s64_f64(double %a) { +; CHECK: test_vcvtpd_s64_f64 +; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtps.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtps1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %vcvtps.i) + %0 = extractelement <1 x i64> %vcvtps1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtpus(float %a) { +; CHECK: test_vcvtpus +; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtpu.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtpu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float> %vcvtpu.i) + %0 = extractelement <1 x i32> %vcvtpu1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtpud(double %a) { +; CHECK: test_vcvtpud +; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtpu.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtpu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %vcvtpu.i) + %0 = extractelement <1 x i64> %vcvtpu1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtss(float %a) { +; CHECK: test_vcvtss +; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}} +entry: + %vcvtzs.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtzs1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float> %vcvtzs.i) + %0 = extractelement <1 x i32> %vcvtzs1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtd_s64_f64(double %a) { +; CHECK: test_vcvtd_s64_f64 +; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvzs.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvzs1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %vcvzs.i) + %0 = extractelement <1 x i64> %vcvzs1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double>) + +define i32 @test_vcvtus(float %a) { +; CHECK: test_vcvtus +; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}} +entry: + 
%vcvtzu.i = insertelement <1 x float> undef, float %a, i32 0 + %vcvtzu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float> %vcvtzu.i) + %0 = extractelement <1 x i32> %vcvtzu1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float>) + +define i64 @test_vcvtud(double %a) { +; CHECK: test_vcvtud +; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}} +entry: + %vcvtzu.i = insertelement <1 x double> undef, double %a, i32 0 + %vcvtzu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %vcvtzu.i) + %0 = extractelement <1 x i64> %vcvtzu1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double>) diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll new file mode 100644 index 0000000000000..a6e58599acdbd --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll @@ -0,0 +1,328 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +;; Scalar Floating-point Compare + +define i32 @test_vceqs_f32(float %a, float %b) { +; CHECK: test_vceqs_f32 +; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vceq.i = insertelement <1 x float> undef, float %a, i32 0 + %vceq1.i = insertelement <1 x float> undef, float %b, i32 0 + %vceq2.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> %vceq1.i) + %0 = extractelement <1 x i32> %vceq2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vceqd_f64(double %a, double %b) { +; CHECK: test_vceqd_f64 +; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vceq.i = insertelement <1 x double> undef, double %a, i32 0 + %vceq1.i = insertelement <1 x double> undef, double %b, i32 0 + %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double> %vceq.i, <1 x double> %vceq1.i) + %0 = extractelement <1 x i64> %vceq2.i, i32 0 + ret i64 %0 +} + +define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 { +; CHECK: test_vceqz_f64 +; CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0.0 +entry: + %0 = fcmp oeq <1 x double> %a, zeroinitializer + %vceqz.i = zext <1 x i1> %0 to <1 x i64> + ret <1 x i64> %vceqz.i +} + +define i32 @test_vceqzs_f32(float %a) { +; CHECK: test_vceqzs_f32 +; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0 +entry: + %vceq.i = insertelement <1 x float> undef, float %a, i32 0 + %vceq1.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> zeroinitializer) + %0 = extractelement <1 x i32> %vceq1.i, i32 0 + ret i32 %0 +} + +define i64 @test_vceqzd_f64(double %a) { +; CHECK: test_vceqzd_f64 +; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0 +entry: + %vceq.i = insertelement <1 x double> undef, double %a, i32 0 + %vceq1.i = tail call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double> %vceq.i, <1 x float> zeroinitializer) #5 + %0 = extractelement <1 x i64> %vceq1.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcges_f32(float %a, float %b) { +; CHECK: test_vcges_f32 +; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcge.i = insertelement <1 x float> undef, float %a, i32 0 + %vcge1.i = insertelement <1 x float> undef, float %b, i32 0 + %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i) + %0 = extractelement <1 x i32> %vcge2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcged_f64(double %a, double %b) { +; CHECK: test_vcged_f64 +; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcge.i = insertelement 
<1 x double> undef, double %a, i32 0 + %vcge1.i = insertelement <1 x double> undef, double %b, i32 0 + %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i) + %0 = extractelement <1 x i64> %vcge2.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcgezs_f32(float %a) { +; CHECK: test_vcgezs_f32 +; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0 +entry: + %vcge.i = insertelement <1 x float> undef, float %a, i32 0 + %vcge1.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> zeroinitializer) + %0 = extractelement <1 x i32> %vcge1.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcgezd_f64(double %a) { +; CHECK: test_vcgezd_f64 +; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0 +entry: + %vcge.i = insertelement <1 x double> undef, double %a, i32 0 + %vcge1.i = tail call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double> %vcge.i, <1 x float> zeroinitializer) #5 + %0 = extractelement <1 x i64> %vcge1.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcgts_f32(float %a, float %b) { +; CHECK: test_vcgts_f32 +; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcgt.i = insertelement <1 x float> undef, float %a, i32 0 + %vcgt1.i = insertelement <1 x float> undef, float %b, i32 0 + %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i) + %0 = extractelement <1 x i32> %vcgt2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcgtd_f64(double %a, double %b) { +; CHECK: test_vcgtd_f64 +; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcgt.i = insertelement <1 x double> undef, double %a, i32 0 + %vcgt1.i = insertelement <1 x double> undef, double %b, i32 0 + %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i) + %0 = extractelement <1 x i64> %vcgt2.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcgtzs_f32(float %a) { +; CHECK: test_vcgtzs_f32 +; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0 +entry: + %vcgt.i = insertelement <1 x float> undef, float %a, i32 0 + %vcgt1.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> zeroinitializer) + %0 = extractelement <1 x i32> %vcgt1.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcgtzd_f64(double %a) { +; CHECK: test_vcgtzd_f64 +; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0 +entry: + %vcgt.i = insertelement <1 x double> undef, double %a, i32 0 + %vcgt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double> %vcgt.i, <1 x float> zeroinitializer) #5 + %0 = extractelement <1 x i64> %vcgt1.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcles_f32(float %a, float %b) { +; CHECK: test_vcles_f32 +; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcge.i = insertelement <1 x float> undef, float %a, i32 0 + %vcge1.i = insertelement <1 x float> undef, float %b, i32 0 + %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i) + %0 = extractelement <1 x i32> %vcge2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcled_f64(double %a, double %b) { +; CHECK: test_vcled_f64 +; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcge.i = insertelement <1 x double> undef, double %a, i32 0 + %vcge1.i = insertelement <1 x double> undef, double %b, i32 0 + %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i) + %0 = extractelement <1 x i64> %vcge2.i, i32 0 + ret i64 %0 +} + +define i32 
@test_vclezs_f32(float %a) { +; CHECK: test_vclezs_f32 +; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0 +entry: + %vcle.i = insertelement <1 x float> undef, float %a, i32 0 + %vcle1.i = call <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float> %vcle.i, <1 x float> zeroinitializer) + %0 = extractelement <1 x i32> %vcle1.i, i32 0 + ret i32 %0 +} + +define i64 @test_vclezd_f64(double %a) { +; CHECK: test_vclezd_f64 +; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0 +entry: + %vcle.i = insertelement <1 x double> undef, double %a, i32 0 + %vcle1.i = tail call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double> %vcle.i, <1 x float> zeroinitializer) #5 + %0 = extractelement <1 x i64> %vcle1.i, i32 0 + ret i64 %0 +} + +define i32 @test_vclts_f32(float %a, float %b) { +; CHECK: test_vclts_f32 +; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcgt.i = insertelement <1 x float> undef, float %b, i32 0 + %vcgt1.i = insertelement <1 x float> undef, float %a, i32 0 + %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i) + %0 = extractelement <1 x i32> %vcgt2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcltd_f64(double %a, double %b) { +; CHECK: test_vcltd_f64 +; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcgt.i = insertelement <1 x double> undef, double %b, i32 0 + %vcgt1.i = insertelement <1 x double> undef, double %a, i32 0 + %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i) + %0 = extractelement <1 x i64> %vcgt2.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcltzs_f32(float %a) { +; CHECK: test_vcltzs_f32 +; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0 +entry: + %vclt.i = insertelement <1 x float> undef, float %a, i32 0 + %vclt1.i = call <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float> %vclt.i, <1 x float> zeroinitializer) + %0 = extractelement <1 x i32> %vclt1.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcltzd_f64(double %a) { +; CHECK: test_vcltzd_f64 +; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0 +entry: + %vclt.i = insertelement <1 x double> undef, double %a, i32 0 + %vclt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double> %vclt.i, <1 x float> zeroinitializer) #5 + %0 = extractelement <1 x i64> %vclt1.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcages_f32(float %a, float %b) { +; CHECK: test_vcages_f32 +; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcage.i = insertelement <1 x float> undef, float %a, i32 0 + %vcage1.i = insertelement <1 x float> undef, float %b, i32 0 + %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i) + %0 = extractelement <1 x i32> %vcage2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcaged_f64(double %a, double %b) { +; CHECK: test_vcaged_f64 +; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcage.i = insertelement <1 x double> undef, double %a, i32 0 + %vcage1.i = insertelement <1 x double> undef, double %b, i32 0 + %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i) + %0 = extractelement <1 x i64> %vcage2.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcagts_f32(float %a, float %b) { +; CHECK: test_vcagts_f32 +; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcagt.i = insertelement <1 x float> undef, float %a, i32 0 + %vcagt1.i = insertelement <1 x float> undef, float %b, i32 0 + %vcagt2.i = call <1 x i32> 
@llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcagt.i, <1 x float> %vcagt1.i) + %0 = extractelement <1 x i32> %vcagt2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcagtd_f64(double %a, double %b) { +; CHECK: test_vcagtd_f64 +; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcagt.i = insertelement <1 x double> undef, double %a, i32 0 + %vcagt1.i = insertelement <1 x double> undef, double %b, i32 0 + %vcagt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcagt.i, <1 x double> %vcagt1.i) + %0 = extractelement <1 x i64> %vcagt2.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcales_f32(float %a, float %b) { +; CHECK: test_vcales_f32 +; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcage.i = insertelement <1 x float> undef, float %b, i32 0 + %vcage1.i = insertelement <1 x float> undef, float %a, i32 0 + %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i) + %0 = extractelement <1 x i32> %vcage2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcaled_f64(double %a, double %b) { +; CHECK: test_vcaled_f64 +; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcage.i = insertelement <1 x double> undef, double %b, i32 0 + %vcage1.i = insertelement <1 x double> undef, double %a, i32 0 + %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i) + %0 = extractelement <1 x i64> %vcage2.i, i32 0 + ret i64 %0 +} + +define i32 @test_vcalts_f32(float %a, float %b) { +; CHECK: test_vcalts_f32 +; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}} +entry: + %vcalt.i = insertelement <1 x float> undef, float %b, i32 0 + %vcalt1.i = insertelement <1 x float> undef, float %a, i32 0 + %vcalt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcalt.i, <1 x float> %vcalt1.i) + %0 = extractelement <1 x i32> %vcalt2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vcaltd_f64(double %a, double %b) { +; CHECK: test_vcaltd_f64 +; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}} +entry: + %vcalt.i = insertelement <1 x double> undef, double %b, i32 0 + %vcalt1.i = insertelement <1 x double> undef, double %a, i32 0 + %vcalt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcalt.i, <1 x double> %vcalt1.i) + %0 = extractelement <1 x i64> %vcalt2.i, i32 0 + ret i64 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) +declare <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double>, <1 x float>) 
+declare <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) +declare <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>) +declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>) diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll new file mode 100644 index 0000000000000..991037f6cb88b --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-mul.ll @@ -0,0 +1,143 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) { +; CHECK: test_vqdmulhh_s16 +; CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} + %1 = insertelement <1 x i16> undef, i16 %a, i32 0 + %2 = insertelement <1 x i16> undef, i16 %b, i32 0 + %3 = call <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16> %1, <1 x i16> %2) + %4 = extractelement <1 x i16> %3, i32 0 + ret i16 %4 +} + +define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) { +; CHECK: test_vqdmulhs_s32 +; CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + %1 = insertelement <1 x i32> undef, i32 %a, i32 0 + %2 = insertelement <1 x i32> undef, i32 %b, i32 0 + %3 = call <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32> %1, <1 x i32> %2) + %4 = extractelement <1 x i32> %3, i32 0 + ret i32 %4 +} + +declare <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16>, <1 x i16>) +declare <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32>, <1 x i32>) + +define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) { +; CHECK: test_vqrdmulhh_s16 +; CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} + %1 = insertelement <1 x i16> undef, i16 %a, i32 0 + %2 = insertelement <1 x i16> undef, i16 %b, i32 0 + %3 = call <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16> %1, <1 x i16> %2) + %4 = extractelement <1 x i16> %3, i32 0 + ret i16 %4 +} + +define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) { +; CHECK: test_vqrdmulhs_s32 +; CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + %1 = insertelement <1 x i32> undef, i32 %a, i32 0 + %2 = insertelement <1 x i32> undef, i32 %b, i32 0 + %3 = call <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32> %1, <1 x i32> %2) + %4 = extractelement <1 x i32> %3, i32 0 + ret i32 %4 +} + +declare <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16>, <1 x i16>) +declare <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32>, <1 x i32>) + +define float @test_vmulxs_f32(float %a, float %b) { +; CHECK: test_vmulxs_f32 +; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + %1 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %b) + ret float %1 +} + +define double @test_vmulxd_f64(double %a, double %b) { +; CHECK: test_vmulxd_f64 +; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + %1 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %b) + ret double %1 +} + +declare float @llvm.aarch64.neon.vmulx.f32(float, float) +declare double @llvm.aarch64.neon.vmulx.f64(double, double) + +define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) { +; CHECK: test_vqdmlalh_s16 +; CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} +entry: + %vqdmlal.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqdmlal1.i = insertelement <1 x i16> undef, i16 %b, i32 0 + %vqdmlal2.i = insertelement <1 x i16> undef, i16 %c, i32 0 + %vqdmlal3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32> %vqdmlal.i, <1 x i16> %vqdmlal1.i, <1 x i16> %vqdmlal2.i) + %0 = extractelement <1 x i32> %vqdmlal3.i, i32 0 + ret i32 
%0 +} + +define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) { +; CHECK: test_vqdmlals_s32 +; CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vqdmlal.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vqdmlal1.i = insertelement <1 x i32> undef, i32 %b, i32 0 + %vqdmlal2.i = insertelement <1 x i32> undef, i32 %c, i32 0 + %vqdmlal3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64> %vqdmlal.i, <1 x i32> %vqdmlal1.i, <1 x i32> %vqdmlal2.i) + %0 = extractelement <1 x i64> %vqdmlal3.i, i32 0 + ret i64 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32>, <1 x i16>, <1 x i16>) +declare <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64>, <1 x i32>, <1 x i32>) + +define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) { +; CHECK: test_vqdmlslh_s16 +; CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} +entry: + %vqdmlsl.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqdmlsl1.i = insertelement <1 x i16> undef, i16 %b, i32 0 + %vqdmlsl2.i = insertelement <1 x i16> undef, i16 %c, i32 0 + %vqdmlsl3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32> %vqdmlsl.i, <1 x i16> %vqdmlsl1.i, <1 x i16> %vqdmlsl2.i) + %0 = extractelement <1 x i32> %vqdmlsl3.i, i32 0 + ret i32 %0 +} + +define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) { +; CHECK: test_vqdmlsls_s32 +; CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vqdmlsl.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vqdmlsl1.i = insertelement <1 x i32> undef, i32 %b, i32 0 + %vqdmlsl2.i = insertelement <1 x i32> undef, i32 %c, i32 0 + %vqdmlsl3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64> %vqdmlsl.i, <1 x i32> %vqdmlsl1.i, <1 x i32> %vqdmlsl2.i) + %0 = extractelement <1 x i64> %vqdmlsl3.i, i32 0 + ret i64 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32>, <1 x i16>, <1 x i16>) +declare <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64>, <1 x i32>, <1 x i32>) + +define i32 @test_vqdmullh_s16(i16 %a, i16 %b) { +; CHECK: test_vqdmullh_s16 +; CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}} +entry: + %vqdmull.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vqdmull1.i = insertelement <1 x i16> undef, i16 %b, i32 0 + %vqdmull2.i = call <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16> %vqdmull.i, <1 x i16> %vqdmull1.i) + %0 = extractelement <1 x i32> %vqdmull2.i, i32 0 + ret i32 %0 +} + +define i64 @test_vqdmulls_s32(i32 %a, i32 %b) { +; CHECK: test_vqdmulls_s32 +; CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vqdmull.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqdmull1.i = insertelement <1 x i32> undef, i32 %b, i32 0 + %vqdmull2.i = call <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32> %vqdmull.i, <1 x i32> %vqdmull1.i) + %0 = extractelement <1 x i64> %vqdmull2.i, i32 0 + ret i64 %0 +} + +declare <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16>, <1 x i16>) +declare <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32>, <1 x i32>) diff --git a/test/CodeGen/AArch64/neon-scalar-neg.ll b/test/CodeGen/AArch64/neon-scalar-neg.ll new file mode 100644 index 0000000000000..4dc9d519783db --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-neg.ll @@ -0,0 +1,61 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define i64 @test_vnegd_s64(i64 %a) { +; CHECK: test_vnegd_s64 +; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vneg.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vneg1.i = tail call <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64> %vneg.i) + %0 = extractelement 
<1 x i64> %vneg1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64>) + +define i8 @test_vqnegb_s8(i8 %a) { +; CHECK: test_vqnegb_s8 +; CHECK: sqneg {{b[0-9]+}}, {{b[0-9]+}} +entry: + %vqneg.i = insertelement <1 x i8> undef, i8 %a, i32 0 + %vqneg1.i = call <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8> %vqneg.i) + %0 = extractelement <1 x i8> %vqneg1.i, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8>) + +define i16 @test_vqnegh_s16(i16 %a) { +; CHECK: test_vqnegh_s16 +; CHECK: sqneg {{h[0-9]+}}, {{h[0-9]+}} +entry: + %vqneg.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vqneg1.i = call <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16> %vqneg.i) + %0 = extractelement <1 x i16> %vqneg1.i, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16>) + +define i32 @test_vqnegs_s32(i32 %a) { +; CHECK: test_vqnegs_s32 +; CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vqneg.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vqneg1.i = call <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32> %vqneg.i) + %0 = extractelement <1 x i32> %vqneg1.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32>) + +define i64 @test_vqnegd_s64(i64 %a) { +; CHECK: test_vqnegd_s64 +; CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vqneg.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vqneg1.i = call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %vqneg.i) + %0 = extractelement <1 x i64> %vqneg1.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll new file mode 100644 index 0000000000000..f21c27bee435d --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-recip.ll @@ -0,0 +1,116 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define float @test_vrecpss_f32(float %a, float %b) { +; CHECK: test_vrecpss_f32 +; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + %1 = insertelement <1 x float> undef, float %a, i32 0 + %2 = insertelement <1 x float> undef, float %b, i32 0 + %3 = call <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float> %1, <1 x float> %2) + %4 = extractelement <1 x float> %3, i32 0 + ret float %4 +} + +define double @test_vrecpsd_f64(double %a, double %b) { +; CHECK: test_vrecpsd_f64 +; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + %1 = insertelement <1 x double> undef, double %a, i32 0 + %2 = insertelement <1 x double> undef, double %b, i32 0 + %3 = call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %1, <1 x double> %2) + %4 = extractelement <1 x double> %3, i32 0 + ret double %4 +} + +declare <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float>, <1 x float>) +declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>) + +define float @test_vrsqrtss_f32(float %a, float %b) { +; CHECK: test_vrsqrtss_f32 +; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + %1 = insertelement <1 x float> undef, float %a, i32 0 + %2 = insertelement <1 x float> undef, float %b, i32 0 + %3 = call <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float> %1, <1 x float> %2) + %4 = extractelement <1 x float> %3, i32 0 + ret float %4 +} + +define double @test_vrsqrtsd_f64(double %a, double %b) { +; CHECK: test_vrsqrtsd_f64 +; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + %1 = insertelement <1 x double> undef, double %a, i32 0 + %2 = insertelement <1 x double> undef, double %b, i32 0 + %3 = call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %1, <1 x double> %2) + %4 = extractelement <1 x double> %3, i32 0 + ret double %4 +} + +declare <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float>, <1 x float>) +declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>) + +define float @test_vrecpes_f32(float %a) { +; CHECK: test_vrecpes_f32 +; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vrecpe.i = insertelement <1 x float> undef, float %a, i32 0 + %vrecpe1.i = tail call <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float> %vrecpe.i) + %0 = extractelement <1 x float> %vrecpe1.i, i32 0 + ret float %0 +} + +define double @test_vrecped_f64(double %a) { +; CHECK: test_vrecped_f64 +; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vrecpe.i = insertelement <1 x double> undef, double %a, i32 0 + %vrecpe1.i = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %vrecpe.i) + %0 = extractelement <1 x double> %vrecpe1.i, i32 0 + ret double %0 +} + +declare <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float>) +declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>) + +define float @test_vrecpxs_f32(float %a) { +; CHECK: test_vrecpxs_f32 +; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vrecpx.i = insertelement <1 x float> undef, float %a, i32 0 + %vrecpx1.i = tail call <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float> %vrecpx.i) + %0 = extractelement <1 x float> %vrecpx1.i, i32 0 + ret float %0 +} + +define double @test_vrecpxd_f64(double %a) { +; CHECK: test_vrecpxd_f64 +; CHECK: frecpx {{d[0-9]+}}, 
{{d[0-9]+}} +entry: + %vrecpx.i = insertelement <1 x double> undef, double %a, i32 0 + %vrecpx1.i = tail call <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double> %vrecpx.i) + %0 = extractelement <1 x double> %vrecpx1.i, i32 0 + ret double %0 +} + +declare <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float>) +declare <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double>) + +define float @test_vrsqrtes_f32(float %a) { +; CHECK: test_vrsqrtes_f32 +; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vrsqrte.i = insertelement <1 x float> undef, float %a, i32 0 + %vrsqrte1.i = tail call <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float> %vrsqrte.i) + %0 = extractelement <1 x float> %vrsqrte1.i, i32 0 + ret float %0 +} + +define double @test_vrsqrted_f64(double %a) { +; CHECK: test_vrsqrted_f64 +; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vrsqrte.i = insertelement <1 x double> undef, double %a, i32 0 + %vrsqrte1.i = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %vrsqrte.i) + %0 = extractelement <1 x double> %vrsqrte1.i, i32 0 + ret double %0 +} + +declare <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float>) +declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>) diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll new file mode 100644 index 0000000000000..80e8dc339d681 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll @@ -0,0 +1,247 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>) + +define <1 x i64> @test_addp_v1i64(<2 x i64> %a) { +; CHECK: test_addp_v1i64: + %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a) +; CHECK: addp d0, v0.2d + ret <1 x i64> %val +} + +declare <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float>) + +define <1 x float> @test_faddp_v1f32(<2 x float> %a) { +; CHECK: test_faddp_v1f32: + %val = call <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float> %a) +; CHECK: faddp s0, v0.2s + ret <1 x float> %val +} + +declare <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double>) + +define <1 x double> @test_faddp_v1f64(<2 x double> %a) { +; CHECK: test_faddp_v1f64: + %val = call <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double> %a) +; CHECK: faddp d0, v0.2d + ret <1 x double> %val +} + + +declare <1 x float> @llvm.aarch64.neon.vpmax(<2 x float>) + +define <1 x float> @test_fmaxp_v1f32(<2 x float> %a) { +; CHECK: test_fmaxp_v1f32: + %val = call <1 x float> @llvm.aarch64.neon.vpmax(<2 x float> %a) +; CHECK: fmaxp s0, v0.2s + ret <1 x float> %val +} + +declare <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double>) + +define <1 x double> @test_fmaxp_v1f64(<2 x double> %a) { +; CHECK: test_fmaxp_v1f64: + %val = call <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double> %a) +; CHECK: fmaxp d0, v0.2d + ret <1 x double> %val +} + + +declare <1 x float> @llvm.aarch64.neon.vpmin(<2 x float>) + +define <1 x float> @test_fminp_v1f32(<2 x float> %a) { +; CHECK: test_fminp_v1f32: + %val = call <1 x float> @llvm.aarch64.neon.vpmin(<2 x float> %a) +; CHECK: fminp s0, v0.2s + ret <1 x float> %val +} + +declare <1 x double> @llvm.aarch64.neon.vpminq(<2 x double>) + +define <1 x double> @test_fminp_v1f64(<2 x double> %a) { +; CHECK: test_fminp_v1f64: + %val = call <1 x double> @llvm.aarch64.neon.vpminq(<2 x double> %a) +; CHECK: fminp d0, v0.2d + ret <1 x double> %val +} + +declare <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float>) + +define <1 x float> 
@test_fmaxnmp_v1f32(<2 x float> %a) { +; CHECK: test_fmaxnmp_v1f32: + %val = call <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float> %a) +; CHECK: fmaxnmp s0, v0.2s + ret <1 x float> %val +} + +declare <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double>) + +define <1 x double> @test_fmaxnmp_v1f64(<2 x double> %a) { +; CHECK: test_fmaxnmp_v1f64: + %val = call <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double> %a) +; CHECK: fmaxnmp d0, v0.2d + ret <1 x double> %val +} + +declare <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float>) + +define <1 x float> @test_fminnmp_v1f32(<2 x float> %a) { +; CHECK: test_fminnmp_v1f32: + %val = call <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float> %a) +; CHECK: fminnmp s0, v0.2s + ret <1 x float> %val +} + +declare <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double>) + +define <1 x double> @test_fminnmp_v1f64(<2 x double> %a) { +; CHECK: test_fminnmp_v1f64: + %val = call <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double> %a) +; CHECK: fminnmp d0, v0.2d + ret <1 x double> %val +} + +define float @test_vaddv_f32(<2 x float> %a) { +; CHECK-LABEL: test_vaddv_f32 +; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s + %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float> %a) + %2 = extractelement <1 x float> %1, i32 0 + ret float %2 +} + +define float @test_vaddvq_f32(<4 x float> %a) { +; CHECK-LABEL: test_vaddvq_f32 +; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s + %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float> %a) + %2 = extractelement <1 x float> %1, i32 0 + ret float %2 +} + +define double @test_vaddvq_f64(<2 x double> %a) { +; CHECK-LABEL: test_vaddvq_f64 +; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d + %1 = tail call <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double> %a) + %2 = extractelement <1 x double> %1, i32 0 + ret double %2 +} + +define float @test_vmaxv_f32(<2 x float> %a) { +; CHECK-LABEL: test_vmaxv_f32 +; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s + %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float> %a) + %2 = extractelement <1 x float> %1, i32 0 + ret float %2 +} + +define double @test_vmaxvq_f64(<2 x double> %a) { +; CHECK-LABEL: test_vmaxvq_f64 +; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d + %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double> %a) + %2 = extractelement <1 x double> %1, i32 0 + ret double %2 +} + +define float @test_vminv_f32(<2 x float> %a) { +; CHECK-LABEL: test_vminv_f32 +; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s + %1 = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float> %a) + %2 = extractelement <1 x float> %1, i32 0 + ret float %2 +} + +define double @test_vminvq_f64(<2 x double> %a) { +; CHECK-LABEL: test_vminvq_f64 +; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d + %1 = tail call <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double> %a) + %2 = extractelement <1 x double> %1, i32 0 + ret double %2 +} + +define double @test_vmaxnmvq_f64(<2 x double> %a) { +; CHECK-LABEL: test_vmaxnmvq_f64 +; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d + %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double> %a) + %2 = extractelement <1 x double> %1, i32 0 + ret double %2 +} + +define float @test_vmaxnmv_f32(<2 x float> %a) { +; CHECK-LABEL: test_vmaxnmv_f32 +; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s + %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float> %a) + %2 = extractelement <1 x float> %1, 
i32 0 + ret float %2 +} + +define double @test_vminnmvq_f64(<2 x double> %a) { +; CHECK-LABEL: test_vminnmvq_f64 +; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d + %1 = tail call <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double> %a) + %2 = extractelement <1 x double> %1, i32 0 + ret double %2 +} + +define float @test_vminnmv_f32(<2 x float> %a) { +; CHECK-LABEL: test_vminnmv_f32 +; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s + %1 = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float> %a) + %2 = extractelement <1 x float> %1, i32 0 + ret float %2 +} + +define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpaddq_s64 +; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %1 +} + +define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vpaddq_u64 +; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %1 +} + +define i64 @test_vaddvq_s64(<2 x i64> %a) { +; CHECK-LABEL: test_vaddvq_s64 +; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d + %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) + %2 = extractelement <1 x i64> %1, i32 0 + ret i64 %2 +} + +define i64 @test_vaddvq_u64(<2 x i64> %a) { +; CHECK-LABEL: test_vaddvq_u64 +; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d + %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a) + %2 = extractelement <1 x i64> %1, i32 0 + ret i64 %2 +} + +declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>) + +declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>) + +declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float>) + +declare <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double>) + +declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float>) + +declare <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double>) + +declare <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double>) + +declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float>) + +declare <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double>) + +declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float>) + +declare <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double>) + +declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float>) + +declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float>)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll new file mode 100644 index 0000000000000..83ceb4ebdad56 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + + +declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_urshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_srshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +declare <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_urshl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_srshl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + + + diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll new file mode 100644 index 0000000000000..bd66f80cebb68 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll @@ -0,0 +1,242 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>) +declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>) + +define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_uqadd_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: uqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + ret <1 x i8> %tmp1 +} + +define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_sqadd_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: sqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + ret <1 x i8> %tmp1 +} + +declare <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8>, <1 x i8>) +declare <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>) + +define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_uqsub_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: uqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + ret <1 x i8> %tmp1 +} + +define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_sqsub_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: sqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + ret <1 x i8> %tmp1 +} + +declare <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16>, <1 x i16>) +declare <1 x i16> 
@llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>) + +define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { +; CHECK: test_uqadd_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: uqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + ret <1 x i16> %tmp1 +} + +define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { +; CHECK: test_sqadd_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: sqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + ret <1 x i16> %tmp1 +} + +declare <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16>, <1 x i16>) +declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>) + +define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { +; CHECK: test_uqsub_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: uqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + ret <1 x i16> %tmp1 +} + +define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { +; CHECK: test_sqsub_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: sqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + ret <1 x i16> %tmp1 +} + +declare <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32>, <1 x i32>) +declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>) + +define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_uqadd_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: uqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + ret <1 x i32> %tmp1 +} + +define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_sqadd_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: sqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + ret <1 x i32> %tmp1 +} + +declare <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32>, <1 x i32>) +declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>) + +define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_uqsub_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: uqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + ret <1 x i32> %tmp1 +} + + +define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_sqsub_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: sqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + ret <1 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqadd_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: uqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqadd_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: sqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x 
i64>, <1 x i64>) + +define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqsub_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: uqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqsub_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: sqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define i8 @test_vuqaddb_s8(i8 %a, i8 %b) { +; CHECK: test_vuqaddb_s8 +; CHECK: suqadd {{b[0-9]+}}, {{b[0-9]+}} +entry: + %vuqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0 + %vuqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0 + %vuqadd2.i = call <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8> %vuqadd.i, <1 x i8> %vuqadd1.i) + %0 = extractelement <1 x i8> %vuqadd2.i, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>) + +define i16 @test_vuqaddh_s16(i16 %a, i16 %b) { +; CHECK: test_vuqaddh_s16 +; CHECK: suqadd {{h[0-9]+}}, {{h[0-9]+}} +entry: + %vuqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vuqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0 + %vuqadd2.i = call <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16> %vuqadd.i, <1 x i16> %vuqadd1.i) + %0 = extractelement <1 x i16> %vuqadd2.i, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>) + +define i32 @test_vuqadds_s32(i32 %a, i32 %b) { +; CHECK: test_vuqadds_s32 +; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vuqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vuqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0 + %vuqadd2.i = call <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32> %vuqadd.i, <1 x i32> %vuqadd1.i) + %0 = extractelement <1 x i32> %vuqadd2.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32>, <1 x i32>) + +define i64 @test_vuqaddd_s64(i64 %a, i64 %b) { +; CHECK: test_vuqaddd_s64 +; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vuqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vuqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vuqadd2.i = call <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64> %vuqadd.i, <1 x i64> %vuqadd1.i) + %0 = extractelement <1 x i64> %vuqadd2.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64>, <1 x i64>) + +define i8 @test_vsqaddb_u8(i8 %a, i8 %b) { +; CHECK: test_vsqaddb_u8 +; CHECK: usqadd {{b[0-9]+}}, {{b[0-9]+}} +entry: + %vsqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0 + %vsqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0 + %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %vsqadd.i, <1 x i8> %vsqadd1.i) + %0 = extractelement <1 x i8> %vsqadd2.i, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8>, <1 x i8>) + +define i16 @test_vsqaddh_u16(i16 %a, i16 %b) { +; CHECK: test_vsqaddh_u16 +; CHECK: usqadd {{h[0-9]+}}, {{h[0-9]+}} +entry: + %vsqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0 + %vsqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0 + %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %vsqadd.i, <1 x i16> %vsqadd1.i) + %0 = extractelement <1 x i16> %vsqadd2.i, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16>, <1 x i16>) + +define i32 @test_vsqadds_u32(i32 %a, i32 %b) { +; CHECK: 
test_vsqadds_u32 +; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}} +entry: + %vsqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0 + %vsqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0 + %vsqadd2.i = call <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32> %vsqadd.i, <1 x i32> %vsqadd1.i) + %0 = extractelement <1 x i32> %vsqadd2.i, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32>, <1 x i32>) + +define i64 @test_vsqaddd_u64(i64 %a, i64 %b) { +; CHECK: test_vsqaddd_u64 +; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}} +entry: + %vsqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0 + %vsqadd2.i = call <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64> %vsqadd.i, <1 x i64> %vsqadd1.i) + %0 = extractelement <1 x i64> %vsqadd2.i, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll new file mode 100644 index 0000000000000..0fd67dfa901c9 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll @@ -0,0 +1,94 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqrshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqrshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +declare <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8>, <1 x i8>) +declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>) + +define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_uqrshl_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: uqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + + ret <1 x i8> %tmp1 +} + +define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_sqrshl_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: sqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + ret <1 x i8> %tmp1 +} + +declare <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16>, <1 x i16>) +declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>) + +define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { +; CHECK: test_uqrshl_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: uqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + + ret <1 x i16> %tmp1 +} + +define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { +; CHECK: test_sqrshl_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: sqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + ret <1 x i16> %tmp1 +} + +declare <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32>, <1 x i32>) +declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 
x i32>, <1 x i32>) + +define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_uqrshl_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: uqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + + ret <1 x i32> %tmp1 +} + +define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_sqrshl_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: sqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + ret <1 x i32> %tmp1 +} + +declare <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqrshl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqrshl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + + + diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll new file mode 100644 index 0000000000000..8fdea24a36d7f --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +declare <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8>, <1 x i8>) +declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>) + +define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_uqshl_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: uqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + ret <1 x i8> %tmp1 +} + +define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) { +; CHECK: test_sqshl_v1i8_aarch64: + %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs) +;CHECK: sqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}} + ret <1 x i8> %tmp1 +} + +declare <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16>, <1 x i16>) +declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>) + +define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) { +; CHECK: test_uqshl_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: uqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + ret <1 x i16> %tmp1 +} + +define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 
x i16> %rhs) { +; CHECK: test_sqshl_v1i16_aarch64: + %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs) +;CHECK: sqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}} + ret <1 x i16> %tmp1 +} + +declare <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32>, <1 x i32>) +declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>) + +define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_uqshl_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: uqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + ret <1 x i32> %tmp1 +} + +define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) { +; CHECK: test_sqshl_v1i32_aarch64: + %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs) +;CHECK: sqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}} + ret <1 x i32> %tmp1 +} + +declare <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqshl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqshl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + + diff --git a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll new file mode 100644 index 0000000000000..62243618171a3 --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll @@ -0,0 +1,531 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define i64 @test_vshrd_n_s64(i64 %a) { +; CHECK: test_vshrd_n_s64 +; CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsshr = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsshr1 = call <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64> %vsshr, i32 63) + %0 = extractelement <1 x i64> %vsshr1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64>, i32) + +define i64 @test_vshrd_n_u64(i64 %a) { +; CHECK: test_vshrd_n_u64 +; CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vushr = insertelement <1 x i64> undef, i64 %a, i32 0 + %vushr1 = call <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64> %vushr, i32 63) + %0 = extractelement <1 x i64> %vushr1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64>, i32) + +define i64 @test_vrshrd_n_s64(i64 %a) { +; CHECK: test_vrshrd_n_s64 +; CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsrshr = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsrshr1 = call <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64> %vsrshr, i32 63) + %0 = extractelement <1 x i64> %vsrshr1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64>, i32) + +define i64 @test_vrshrd_n_u64(i64 %a) { +; CHECK: test_vrshrd_n_u64 +; CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vurshr = insertelement <1 x i64> undef, i64 %a, i32 0 + %vurshr1 = call <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64> %vurshr, i32 63) + %0 = extractelement <1 x i64> %vurshr1, i32 0 + ret i64 %0 +} + +declare <1 x i64> 
@llvm.aarch64.neon.vurshr.v1i64(<1 x i64>, i32) + +define i64 @test_vsrad_n_s64(i64 %a, i64 %b) { +; CHECK: test_vsrad_n_s64 +; CHECK: ssra {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vssra = insertelement <1 x i64> undef, i64 %a, i32 0 + %vssra1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vssra2 = call <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64> %vssra, <1 x i64> %vssra1, i32 63) + %0 = extractelement <1 x i64> %vssra2, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64>, <1 x i64>, i32) + +define i64 @test_vsrad_n_u64(i64 %a, i64 %b) { +; CHECK: test_vsrad_n_u64 +; CHECK: usra {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vusra = insertelement <1 x i64> undef, i64 %a, i32 0 + %vusra1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vusra2 = call <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64> %vusra, <1 x i64> %vusra1, i32 63) + %0 = extractelement <1 x i64> %vusra2, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64>, <1 x i64>, i32) + +define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) { +; CHECK: test_vrsrad_n_s64 +; CHECK: srsra {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsrsra = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsrsra1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vsrsra2 = call <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64> %vsrsra, <1 x i64> %vsrsra1, i32 63) + %0 = extractelement <1 x i64> %vsrsra2, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64>, <1 x i64>, i32) + +define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) { +; CHECK: test_vrsrad_n_u64 +; CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vursra = insertelement <1 x i64> undef, i64 %a, i32 0 + %vursra1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vursra2 = call <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64> %vursra, <1 x i64> %vursra1, i32 63) + %0 = extractelement <1 x i64> %vursra2, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64>, <1 x i64>, i32) + +define i64 @test_vshld_n_s64(i64 %a) { +; CHECK: test_vshld_n_s64 +; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vshl = insertelement <1 x i64> undef, i64 %a, i32 0 + %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63) + %0 = extractelement <1 x i64> %vshl1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64>, i32) + +define i64 @test_vshld_n_u64(i64 %a) { +; CHECK: test_vshld_n_u64 +; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vshl = insertelement <1 x i64> undef, i64 %a, i32 0 + %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63) + %0 = extractelement <1 x i64> %vshl1, i32 0 + ret i64 %0 +} + +define i8 @test_vqshlb_n_s8(i8 %a) { +; CHECK: test_vqshlb_n_s8 +; CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, #7 +entry: + %vsqshl = insertelement <1 x i8> undef, i8 %a, i32 0 + %vsqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8> %vsqshl, i32 7) + %0 = extractelement <1 x i8> %vsqshl1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8>, i32) + +define i16 @test_vqshlh_n_s16(i16 %a) { +; CHECK: test_vqshlh_n_s16 +; CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, #15 +entry: + %vsqshl = insertelement <1 x i16> undef, i16 %a, i32 0 + %vsqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16> %vsqshl, i32 15) + %0 = extractelement <1 x i16> %vsqshl1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16>, i32) + +define i32 @test_vqshls_n_s32(i32 %a) { +; CHECK: 
test_vqshls_n_s32 +; CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31 +entry: + %vsqshl = insertelement <1 x i32> undef, i32 %a, i32 0 + %vsqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32> %vsqshl, i32 31) + %0 = extractelement <1 x i32> %vsqshl1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32>, i32) + +define i64 @test_vqshld_n_s64(i64 %a) { +; CHECK: test_vqshld_n_s64 +; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsqshl = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64> %vsqshl, i32 63) + %0 = extractelement <1 x i64> %vsqshl1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64>, i32) + +define i8 @test_vqshlb_n_u8(i8 %a) { +; CHECK: test_vqshlb_n_u8 +; CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, #7 +entry: + %vuqshl = insertelement <1 x i8> undef, i8 %a, i32 0 + %vuqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8> %vuqshl, i32 7) + %0 = extractelement <1 x i8> %vuqshl1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8>, i32) + +define i16 @test_vqshlh_n_u16(i16 %a) { +; CHECK: test_vqshlh_n_u16 +; CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, #15 +entry: + %vuqshl = insertelement <1 x i16> undef, i16 %a, i32 0 + %vuqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16> %vuqshl, i32 15) + %0 = extractelement <1 x i16> %vuqshl1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16>, i32) + +define i32 @test_vqshls_n_u32(i32 %a) { +; CHECK: test_vqshls_n_u32 +; CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31 +entry: + %vuqshl = insertelement <1 x i32> undef, i32 %a, i32 0 + %vuqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32> %vuqshl, i32 31) + %0 = extractelement <1 x i32> %vuqshl1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32>, i32) + +define i64 @test_vqshld_n_u64(i64 %a) { +; CHECK: test_vqshld_n_u64 +; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vuqshl = insertelement <1 x i64> undef, i64 %a, i32 0 + %vuqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64> %vuqshl, i32 63) + %0 = extractelement <1 x i64> %vuqshl1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64>, i32) + +define i8 @test_vqshlub_n_s8(i8 %a) { +; CHECK: test_vqshlub_n_s8 +; CHECK: sqshlu {{b[0-9]+}}, {{b[0-9]+}}, #7 +entry: + %vsqshlu = insertelement <1 x i8> undef, i8 %a, i32 0 + %vsqshlu1 = call <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8> %vsqshlu, i32 7) + %0 = extractelement <1 x i8> %vsqshlu1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8>, i32) + +define i16 @test_vqshluh_n_s16(i16 %a) { +; CHECK: test_vqshluh_n_s16 +; CHECK: sqshlu {{h[0-9]+}}, {{h[0-9]+}}, #15 +entry: + %vsqshlu = insertelement <1 x i16> undef, i16 %a, i32 0 + %vsqshlu1 = call <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16> %vsqshlu, i32 15) + %0 = extractelement <1 x i16> %vsqshlu1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16>, i32) + +define i32 @test_vqshlus_n_s32(i32 %a) { +; CHECK: test_vqshlus_n_s32 +; CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31 +entry: + %vsqshlu = insertelement <1 x i32> undef, i32 %a, i32 0 + %vsqshlu1 = call <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32> %vsqshlu, i32 31) + %0 = extractelement <1 x i32> %vsqshlu1, i32 0 + ret i32 %0 +} + +declare <1 x i32> 
@llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32>, i32) + +define i64 @test_vqshlud_n_s64(i64 %a) { +; CHECK: test_vqshlud_n_s64 +; CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsqshlu = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsqshlu1 = call <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64> %vsqshlu, i32 63) + %0 = extractelement <1 x i64> %vsqshlu1, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64>, i32) + +define i64 @test_vsrid_n_s64(i64 %a, i64 %b) { +; CHECK: test_vsrid_n_s64 +; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsri = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63) + %0 = extractelement <1 x i64> %vsri2, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64>, <1 x i64>, i32) + +define i64 @test_vsrid_n_u64(i64 %a, i64 %b) { +; CHECK: test_vsrid_n_u64 +; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsri = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63) + %0 = extractelement <1 x i64> %vsri2, i32 0 + ret i64 %0 +} + +define i64 @test_vslid_n_s64(i64 %a, i64 %b) { +; CHECK: test_vslid_n_s64 +; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsli = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63) + %0 = extractelement <1 x i64> %vsli2, i32 0 + ret i64 %0 +} + +declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) + +define i64 @test_vslid_n_u64(i64 %a, i64 %b) { +; CHECK: test_vslid_n_u64 +; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63 +entry: + %vsli = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0 + %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63) + %0 = extractelement <1 x i64> %vsli2, i32 0 + ret i64 %0 +} + +define i8 @test_vqshrnh_n_s16(i16 %a) { +; CHECK: test_vqshrnh_n_s16 +; CHECK: sqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8 +entry: + %vsqshrn = insertelement <1 x i16> undef, i16 %a, i32 0 + %vsqshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16> %vsqshrn, i32 8) + %0 = extractelement <1 x i8> %vsqshrn1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16>, i32) + +define i16 @test_vqshrns_n_s32(i32 %a) { +; CHECK: test_vqshrns_n_s32 +; CHECK: sqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 +entry: + %vsqshrn = insertelement <1 x i32> undef, i32 %a, i32 0 + %vsqshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32> %vsqshrn, i32 16) + %0 = extractelement <1 x i16> %vsqshrn1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32>, i32) + +define i32 @test_vqshrnd_n_s64(i64 %a) { +; CHECK: test_vqshrnd_n_s64 +; CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 +entry: + %vsqshrn = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsqshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64> %vsqshrn, i32 32) + %0 = extractelement <1 x i32> %vsqshrn1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64>, i32) + +define i8 @test_vqshrnh_n_u16(i16 %a) { +; CHECK: test_vqshrnh_n_u16 +; CHECK: uqshrn {{b[0-9]+}}, 
{{h[0-9]+}}, #8 +entry: + %vuqshrn = insertelement <1 x i16> undef, i16 %a, i32 0 + %vuqshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16> %vuqshrn, i32 8) + %0 = extractelement <1 x i8> %vuqshrn1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16>, i32) + +define i16 @test_vqshrns_n_u32(i32 %a) { +; CHECK: test_vqshrns_n_u32 +; CHECK: uqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 +entry: + %vuqshrn = insertelement <1 x i32> undef, i32 %a, i32 0 + %vuqshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32> %vuqshrn, i32 16) + %0 = extractelement <1 x i16> %vuqshrn1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32>, i32) + +define i32 @test_vqshrnd_n_u64(i64 %a) { +; CHECK: test_vqshrnd_n_u64 +; CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 +entry: + %vuqshrn = insertelement <1 x i64> undef, i64 %a, i32 0 + %vuqshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64> %vuqshrn, i32 32) + %0 = extractelement <1 x i32> %vuqshrn1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64>, i32) + +define i8 @test_vqrshrnh_n_s16(i16 %a) { +; CHECK: test_vqrshrnh_n_s16 +; CHECK: sqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8 +entry: + %vsqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0 + %vsqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16> %vsqrshrn, i32 8) + %0 = extractelement <1 x i8> %vsqrshrn1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16>, i32) + +define i16 @test_vqrshrns_n_s32(i32 %a) { +; CHECK: test_vqrshrns_n_s32 +; CHECK: sqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 +entry: + %vsqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0 + %vsqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32> %vsqrshrn, i32 16) + %0 = extractelement <1 x i16> %vsqrshrn1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32>, i32) + +define i32 @test_vqrshrnd_n_s64(i64 %a) { +; CHECK: test_vqrshrnd_n_s64 +; CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 +entry: + %vsqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64> %vsqrshrn, i32 32) + %0 = extractelement <1 x i32> %vsqrshrn1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64>, i32) + +define i8 @test_vqrshrnh_n_u16(i16 %a) { +; CHECK: test_vqrshrnh_n_u16 +; CHECK: uqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8 +entry: + %vuqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0 + %vuqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16> %vuqrshrn, i32 8) + %0 = extractelement <1 x i8> %vuqrshrn1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16>, i32) + +define i16 @test_vqrshrns_n_u32(i32 %a) { +; CHECK: test_vqrshrns_n_u32 +; CHECK: uqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16 +entry: + %vuqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0 + %vuqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %vuqrshrn, i32 16) + %0 = extractelement <1 x i16> %vuqrshrn1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32) + +define i32 @test_vqrshrnd_n_u64(i64 %a) { +; CHECK: test_vqrshrnd_n_u64 +; CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32 +entry: + %vuqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0 + %vuqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64> %vuqrshrn, i32 32) + %0 = extractelement <1 x i32> %vuqrshrn1, i32 0 + ret i32 %0 +} + 
+declare <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64>, i32) + +define i8 @test_vqshrunh_n_s16(i16 %a) { +; CHECK: test_vqshrunh_n_s16 +; CHECK: sqshrun {{b[0-9]+}}, {{h[0-9]+}}, #8 +entry: + %vsqshrun = insertelement <1 x i16> undef, i16 %a, i32 0 + %vsqshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16> %vsqshrun, i32 8) + %0 = extractelement <1 x i8> %vsqshrun1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16>, i32) + +define i16 @test_vqshruns_n_s32(i32 %a) { +; CHECK: test_vqshruns_n_s32 +; CHECK: sqshrun {{h[0-9]+}}, {{s[0-9]+}}, #16 +entry: + %vsqshrun = insertelement <1 x i32> undef, i32 %a, i32 0 + %vsqshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32> %vsqshrun, i32 16) + %0 = extractelement <1 x i16> %vsqshrun1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32>, i32) + +define i32 @test_vqshrund_n_s64(i64 %a) { +; CHECK: test_vqshrund_n_s64 +; CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 +entry: + %vsqshrun = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsqshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64> %vsqshrun, i32 32) + %0 = extractelement <1 x i32> %vsqshrun1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64>, i32) + +define i8 @test_vqrshrunh_n_s16(i16 %a) { +; CHECK: test_vqrshrunh_n_s16 +; CHECK: sqrshrun {{b[0-9]+}}, {{h[0-9]+}}, #8 +entry: + %vsqrshrun = insertelement <1 x i16> undef, i16 %a, i32 0 + %vsqrshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16> %vsqrshrun, i32 8) + %0 = extractelement <1 x i8> %vsqrshrun1, i32 0 + ret i8 %0 +} + +declare <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16>, i32) + +define i16 @test_vqrshruns_n_s32(i32 %a) { +; CHECK: test_vqrshruns_n_s32 +; CHECK: sqrshrun {{h[0-9]+}}, {{s[0-9]+}}, #16 +entry: + %vsqrshrun = insertelement <1 x i32> undef, i32 %a, i32 0 + %vsqrshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32> %vsqrshrun, i32 16) + %0 = extractelement <1 x i16> %vsqrshrun1, i32 0 + ret i16 %0 +} + +declare <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32>, i32) + +define i32 @test_vqrshrund_n_s64(i64 %a) { +; CHECK: test_vqrshrund_n_s64 +; CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32 +entry: + %vsqrshrun = insertelement <1 x i64> undef, i64 %a, i32 0 + %vsqrshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64> %vsqrshrun, i32 32) + %0 = extractelement <1 x i32> %vsqrshrun1, i32 0 + ret i32 %0 +} + +declare <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64>, i32) diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll new file mode 100644 index 0000000000000..1222be50cf4bc --- /dev/null +++ b/test/CodeGen/AArch64/neon-scalar-shift.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_ushl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sshl {{d[0-31]+}}, 
{{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +declare <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_ushl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sshl_v1i64_aarch64: + %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + ret <1 x i64> %tmp1 +} + + diff --git a/test/CodeGen/AArch64/neon-shift-left-long.ll b/test/CodeGen/AArch64/neon-shift-left-long.ll new file mode 100644 index 0000000000000..d45c47685b0f4 --- /dev/null +++ b/test/CodeGen/AArch64/neon-shift-left-long.ll @@ -0,0 +1,193 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i16> @test_sshll_v8i8(<8 x i8> %a) { +; CHECK: test_sshll_v8i8: +; CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3 + %1 = sext <8 x i8> %a to <8 x i16> + %tmp = shl <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_sshll_v4i16(<4 x i16> %a) { +; CHECK: test_sshll_v4i16: +; CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9 + %1 = sext <4 x i16> %a to <4 x i32> + %tmp = shl <4 x i32> %1, <i32 9, i32 9, i32 9, i32 9> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_sshll_v2i32(<2 x i32> %a) { +; CHECK: test_sshll_v2i32: +; CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19 + %1 = sext <2 x i32> %a to <2 x i64> + %tmp = shl <2 x i64> %1, <i64 19, i64 19> + ret <2 x i64> %tmp +} + +define <8 x i16> @test_ushll_v8i8(<8 x i8> %a) { +; CHECK: test_ushll_v8i8: +; CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3 + %1 = zext <8 x i8> %a to <8 x i16> + %tmp = shl <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_ushll_v4i16(<4 x i16> %a) { +; CHECK: test_ushll_v4i16: +; CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9 + %1 = zext <4 x i16> %a to <4 x i32> + %tmp = shl <4 x i32> %1, <i32 9, i32 9, i32 9, i32 9> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_ushll_v2i32(<2 x i32> %a) { +; CHECK: test_ushll_v2i32: +; CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19 + %1 = zext <2 x i32> %a to <2 x i64> + %tmp = shl <2 x i64> %1, <i64 19, i64 19> + ret <2 x i64> %tmp +} + +define <8 x i16> @test_sshll2_v16i8(<16 x i8> %a) { +; CHECK: test_sshll2_v16i8: +; CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3 + %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = sext <8 x i8> %1 to <8 x i16> + %tmp = shl <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_sshll2_v8i16(<8 x i16> %a) { +; CHECK: test_sshll2_v8i16: +; CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %2 = sext <4 x i16> %1 to <4 x i32> + %tmp = shl <4 x i32> %2, <i32 9, i32 9, i32 9, i32 9> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_sshll2_v4i32(<4 x i32> %a) { +; CHECK: test_sshll2_v4i32: +; CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19 + %1 = shufflevector <4 x i32> %a, <4 x 
i32> undef, <2 x i32> <i32 2, i32 3> + %2 = sext <2 x i32> %1 to <2 x i64> + %tmp = shl <2 x i64> %2, <i64 19, i64 19> + ret <2 x i64> %tmp +} + +define <8 x i16> @test_ushll2_v16i8(<16 x i8> %a) { +; CHECK: test_ushll2_v16i8: +; CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3 + %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %2 = zext <8 x i8> %1 to <8 x i16> + %tmp = shl <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_ushll2_v8i16(<8 x i16> %a) { +; CHECK: test_ushll2_v8i16: +; CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %2 = zext <4 x i16> %1 to <4 x i32> + %tmp = shl <4 x i32> %2, <i32 9, i32 9, i32 9, i32 9> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_ushll2_v4i32(<4 x i32> %a) { +; CHECK: test_ushll2_v4i32: +; CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %2 = zext <2 x i32> %1 to <2 x i64> + %tmp = shl <2 x i64> %2, <i64 19, i64 19> + ret <2 x i64> %tmp +} + +define <8 x i16> @test_sshll_shl0_v8i8(<8 x i8> %a) { +; CHECK: test_sshll_shl0_v8i8: +; CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0 + %tmp = sext <8 x i8> %a to <8 x i16> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_sshll_shl0_v4i16(<4 x i16> %a) { +; CHECK: test_sshll_shl0_v4i16: +; CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0 + %tmp = sext <4 x i16> %a to <4 x i32> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_sshll_shl0_v2i32(<2 x i32> %a) { +; CHECK: test_sshll_shl0_v2i32: +; CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0 + %tmp = sext <2 x i32> %a to <2 x i64> + ret <2 x i64> %tmp +} + +define <8 x i16> @test_ushll_shl0_v8i8(<8 x i8> %a) { +; CHECK: test_ushll_shl0_v8i8: +; CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0 + %tmp = zext <8 x i8> %a to <8 x i16> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_ushll_shl0_v4i16(<4 x i16> %a) { +; CHECK: test_ushll_shl0_v4i16: +; CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0 + %tmp = zext <4 x i16> %a to <4 x i32> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_ushll_shl0_v2i32(<2 x i32> %a) { +; CHECK: test_ushll_shl0_v2i32: +; CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0 + %tmp = zext <2 x i32> %a to <2 x i64> + ret <2 x i64> %tmp +} + +define <8 x i16> @test_sshll2_shl0_v16i8(<16 x i8> %a) { +; CHECK: test_sshll2_shl0_v16i8: +; CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0 + %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %tmp = sext <8 x i8> %1 to <8 x i16> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_sshll2_shl0_v8i16(<8 x i16> %a) { +; CHECK: test_sshll2_shl0_v8i16: +; CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %tmp = sext <4 x i16> %1 to <4 x i32> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_sshll2_shl0_v4i32(<4 x i32> %a) { +; CHECK: test_sshll2_shl0_v4i32: +; CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %tmp = sext <2 x i32> %1 to <2 x i64> + ret <2 x i64> %tmp +} + +define <8 x i16> @test_ushll2_shl0_v16i8(<16 x i8> %a) { +; CHECK: test_ushll2_shl0_v16i8: +; CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0 + %1 = shufflevector <16 x i8> %a, 
<16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %tmp = zext <8 x i8> %1 to <8 x i16> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_ushll2_shl0_v8i16(<8 x i16> %a) { +; CHECK: test_ushll2_shl0_v8i16: +; CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0 + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %tmp = zext <4 x i16> %1 to <4 x i32> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_ushll2_shl0_v4i32(<4 x i32> %a) { +; CHECK: test_ushll2_shl0_v4i32: +; CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0 + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %tmp = zext <2 x i32> %1 to <2 x i64> + ret <2 x i64> %tmp +} diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll new file mode 100644 index 0000000000000..33b04ceb4895e --- /dev/null +++ b/test/CodeGen/AArch64/neon-shift.ll @@ -0,0 +1,171 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: ushl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_ushl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: ushl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_ushl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: ushl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_ushl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: ushl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 
+} + +declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_ushl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: ushl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_ushl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: ushl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_ushl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: ushl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + + +define <8 x i8> @test_shl_v8i8(<8 x i8> %a) { +; CHECK: test_shl_v8i8: +; CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %tmp = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <8 x i8> %tmp +} + +define <4 x i16> @test_shl_v4i16(<4 x i16> %a) { +; CHECK: test_shl_v4i16: +; CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %tmp = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3> + ret <4 x i16> %tmp +} + +define <2 x i32> @test_shl_v2i32(<2 x i32> %a) { +; CHECK: test_shl_v2i32: +; CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %tmp = shl <2 x i32> %a, <i32 3, i32 3> + ret <2 x i32> %tmp +} + +define <16 x i8> @test_shl_v16i8(<16 x i8> %a) { +; CHECK: test_shl_v16i8: +; CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %tmp = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <16 x i8> %tmp +} + +define <8 x i16> @test_shl_v8i16(<8 x i16> %a) { +; CHECK: test_shl_v8i16: +; CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %tmp = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %tmp +} + +define <4 x i32> @test_shl_v4i32(<4 x i32> %a) { +; CHECK: test_shl_v4i32: +; CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %tmp = shl <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %tmp +} + +define <2 x i64> @test_shl_v2i64(<2 x i64> %a) { +; CHECK: test_shl_v2i64: +; CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63 + %tmp = shl <2 x i64> %a, <i64 63, i64 63> + ret <2 x i64> %tmp +} + diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll new 
file mode 100644 index 0000000000000..d5557c0c85622 --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll @@ -0,0 +1,2314 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) { +; CHECK-LABEL: test_ldst1_v16i8: +; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %tmp = load <16 x i8>* %ptr + store <16 x i8> %tmp, <16 x i8>* %ptr2 + ret void +} + +define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) { +; CHECK-LABEL: test_ldst1_v8i16: +; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %tmp = load <8 x i16>* %ptr + store <8 x i16> %tmp, <8 x i16>* %ptr2 + ret void +} + +define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) { +; CHECK-LABEL: test_ldst1_v4i32: +; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %tmp = load <4 x i32>* %ptr + store <4 x i32> %tmp, <4 x i32>* %ptr2 + ret void +} + +define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) { +; CHECK-LABEL: test_ldst1_v2i64: +; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %tmp = load <2 x i64>* %ptr + store <2 x i64> %tmp, <2 x i64>* %ptr2 + ret void +} + +define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) { +; CHECK-LABEL: test_ldst1_v8i8: +; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %tmp = load <8 x i8>* %ptr + store <8 x i8> %tmp, <8 x i8>* %ptr2 + ret void +} + +define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) { +; CHECK-LABEL: test_ldst1_v4i16: +; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %tmp = load <4 x i16>* %ptr + store <4 x i16> %tmp, <4 x i16>* %ptr2 + ret void +} + +define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) { +; CHECK-LABEL: test_ldst1_v2i32: +; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %tmp = load <2 x i32>* %ptr + store <2 x i32> %tmp, <2 x i32>* %ptr2 + ret void +} + +define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) { +; CHECK-LABEL: test_ldst1_v1i64: +; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] +; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %tmp = load <1 x i64>* %ptr + store <1 x i64> %tmp, <1 x i64>* %ptr2 + ret void +} + +%struct.int8x16x2_t = type { [2 x <16 x i8>] } +%struct.int16x8x2_t = type { [2 x <8 x i16>] } +%struct.int32x4x2_t = type { [2 x <4 x i32>] } +%struct.int64x2x2_t = type { [2 x <2 x i64>] } +%struct.float32x4x2_t = type { [2 x <4 x float>] } +%struct.float64x2x2_t = type { [2 x <2 x double>] } +%struct.int8x8x2_t = type { [2 x <8 x i8>] } +%struct.int16x4x2_t = type { [2 x <4 x i16>] } +%struct.int32x2x2_t = type { [2 x <2 x i32>] } +%struct.int64x1x2_t = type { [2 x <1 x i64>] } +%struct.float32x2x2_t = type { [2 x <2 x float>] } +%struct.float64x1x2_t = type { [2 x <1 x double>] } +%struct.int8x16x3_t = type { [3 x <16 x i8>] } +%struct.int16x8x3_t = type { [3 x <8 x i16>] } +%struct.int32x4x3_t = type { [3 x <4 x i32>] } +%struct.int64x2x3_t = type { [3 x <2 x i64>] } +%struct.float32x4x3_t = type { [3 x <4 x float>] } +%struct.float64x2x3_t = type { [3 x <2 x double>] } +%struct.int8x8x3_t = type { [3 x <8 x i8>] } +%struct.int16x4x3_t = type { [3 x <4 x i16>] } +%struct.int32x2x3_t = type { [3 x <2 
x i32>] } +%struct.int64x1x3_t = type { [3 x <1 x i64>] } +%struct.float32x2x3_t = type { [3 x <2 x float>] } +%struct.float64x1x3_t = type { [3 x <1 x double>] } +%struct.int8x16x4_t = type { [4 x <16 x i8>] } +%struct.int16x8x4_t = type { [4 x <8 x i16>] } +%struct.int32x4x4_t = type { [4 x <4 x i32>] } +%struct.int64x2x4_t = type { [4 x <2 x i64>] } +%struct.float32x4x4_t = type { [4 x <4 x float>] } +%struct.float64x2x4_t = type { [4 x <2 x double>] } +%struct.int8x8x4_t = type { [4 x <8 x i8>] } +%struct.int16x4x4_t = type { [4 x <4 x i16>] } +%struct.int32x2x4_t = type { [4 x <2 x i32>] } +%struct.int64x1x4_t = type { [4 x <1 x i64>] } +%struct.float32x2x4_t = type { [4 x <2 x float>] } +%struct.float64x1x4_t = type { [4 x <1 x double>] } + + +define <16 x i8> @test_vld1q_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld1q_s8 +; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1) + ret <16 x i8> %vld1 +} + +define <8 x i16> @test_vld1q_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld1q_s16 +; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2) + ret <8 x i16> %vld1 +} + +define <4 x i32> @test_vld1q_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld1q_s32 +; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4) + ret <4 x i32> %vld1 +} + +define <2 x i64> @test_vld1q_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld1q_s64 +; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8) + ret <2 x i64> %vld1 +} + +define <4 x float> @test_vld1q_f32(float* readonly %a) { +; CHECK-LABEL: test_vld1q_f32 +; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4) + ret <4 x float> %vld1 +} + +define <2 x double> @test_vld1q_f64(double* readonly %a) { +; CHECK-LABEL: test_vld1q_f64 +; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8) + ret <2 x double> %vld1 +} + +define <8 x i8> @test_vld1_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld1_s8 +; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1) + ret <8 x i8> %vld1 +} + +define <4 x i16> @test_vld1_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld1_s16 +; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2) + ret <4 x i16> %vld1 +} + +define <2 x i32> @test_vld1_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld1_s32 +; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4) + ret <2 x i32> %vld1 +} + +define <1 x i64> @test_vld1_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld1_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8) + ret <1 x i64> %vld1 +} + +define <2 x float> @test_vld1_f32(float* readonly %a) { +; CHECK-LABEL: test_vld1_f32 +; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld1 = tail call <2 x float> 
@llvm.arm.neon.vld1.v2f32(i8* %1, i32 4) + ret <2 x float> %vld1 +} + +define <1 x double> @test_vld1_f64(double* readonly %a) { +; CHECK-LABEL: test_vld1_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8) + ret <1 x double> %vld1 +} + +define <8 x i8> @test_vld1_p8(i8* readonly %a) { +; CHECK-LABEL: test_vld1_p8 +; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1) + ret <8 x i8> %vld1 +} + +define <4 x i16> @test_vld1_p16(i16* readonly %a) { +; CHECK-LABEL: test_vld1_p16 +; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2) + ret <4 x i16> %vld1 +} + +define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld2q_s8 +; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1) + %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld2q_s16 +; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2) + %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld2q_s32 +; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld2q_s64 +; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8) + %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1 + ret %struct.int64x2x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) { +; 
CHECK-LABEL: test_vld2q_f32 +; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) { +; CHECK-LABEL: test_vld2q_f64 +; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8) + %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1 + ret %struct.float64x2x2_t %.fca.0.1.insert +} + +define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld2_s8 +; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1) + %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1 + ret %struct.int8x8x2_t %.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld2_s16 +; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2) + %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld2_s32 +; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld2_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, 
i32 8) + %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1 + ret %struct.int64x1x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) { +; CHECK-LABEL: test_vld2_f32 +; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) { +; CHECK-LABEL: test_vld2_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8) + %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1 + ret %struct.float64x1x2_t %.fca.0.1.insert +} + +define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld3q_s8 +; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1) + %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2 + ret %struct.int8x16x3_t %.fca.0.2.insert +} + +define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld3q_s16 +; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2) + %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2 + ret 
%struct.int16x8x3_t %.fca.0.2.insert +} + +define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld3q_s32 +; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2 + ret %struct.int32x4x3_t %.fca.0.2.insert +} + +define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld3q_s64 +; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2 + ret %struct.int64x2x3_t %.fca.0.2.insert +} + +define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) { +; CHECK-LABEL: test_vld3q_f32 +; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2 + ret %struct.float32x4x3_t %.fca.0.2.insert +} + +define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) { +; CHECK-LABEL: test_vld3q_f64 +; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue 
%struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2 + ret %struct.float64x2x3_t %.fca.0.2.insert +} + +define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld3_s8 +; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1) + %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2 + ret %struct.int8x8x3_t %.fca.0.2.insert +} + +define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld3_s16 +; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2) + %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2 + ret %struct.int16x4x3_t %.fca.0.2.insert +} + +define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld3_s32 +; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2 + ret %struct.int32x2x3_t %.fca.0.2.insert +} + +define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld3_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0 + 
%.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2 + ret %struct.int64x1x3_t %.fca.0.2.insert +} + +define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) { +; CHECK-LABEL: test_vld3_f32 +; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2 + ret %struct.float32x2x3_t %.fca.0.2.insert +} + +define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) { +; CHECK-LABEL: test_vld3_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2 + ret %struct.float64x1x3_t %.fca.0.2.insert +} + +define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld4q_s8 +; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1) + %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3 + ret %struct.int8x16x4_t %.fca.0.3.insert +} + +define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld4q_s16 +; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld4 = tail call 
{ <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2) + %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3 + ret %struct.int16x8x4_t %.fca.0.3.insert +} + +define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld4q_s32 +; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3 + ret %struct.int32x4x4_t %.fca.0.3.insert +} + +define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld4q_s64 +; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3 + ret %struct.int64x2x4_t %.fca.0.3.insert +} + +define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) { +; CHECK-LABEL: test_vld4q_f32 +; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x 
float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3 + ret %struct.float32x4x4_t %.fca.0.3.insert +} + +define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) { +; CHECK-LABEL: test_vld4q_f64 +; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3 + ret %struct.float64x2x4_t %.fca.0.3.insert +} + +define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) { +; CHECK-LABEL: test_vld4_s8 +; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1) + %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3 + ret %struct.int8x8x4_t %.fca.0.3.insert +} + +define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) { +; CHECK-LABEL: test_vld4_s16 +; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld4 = tail call { <4 x i16>, <4 x 
i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2) + %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3 + ret %struct.int16x4x4_t %.fca.0.3.insert +} + +define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) { +; CHECK-LABEL: test_vld4_s32 +; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3 + ret %struct.int32x2x4_t %.fca.0.3.insert +} + +define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) { +; CHECK-LABEL: test_vld4_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3 + ret %struct.int64x1x4_t %.fca.0.3.insert +} + +define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) { +; CHECK-LABEL: test_vld4_f32 +; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } 
@llvm.arm.neon.vld4.v2f32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3 + ret %struct.float32x2x4_t %.fca.0.3.insert +} + +define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) { +; CHECK-LABEL: test_vld4_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3 + ret %struct.float64x1x4_t %.fca.0.3.insert +} + +declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) +declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) +declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) +declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) +declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) +declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32) +declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) +declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) +declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) +declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) +declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) +declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32) +declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32) +declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) +declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32) +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32) +declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32) +declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, 
i32) +declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32) +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) + +define void @test_vst1q_s8(i8* %a, <16 x i8> %b) { +; CHECK-LABEL: test_vst1q_s8 +; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1) + ret void +} + +define void @test_vst1q_s16(i16* %a, <8 x i16> %b) { +; CHECK-LABEL: test_vst1q_s16 +; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2) + ret void +} + +define void @test_vst1q_s32(i32* %a, <4 x i32> %b) { +; CHECK-LABEL: test_vst1q_s32 +; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4) + ret void +} + +define void @test_vst1q_s64(i64* %a, <2 x i64> %b) { +; CHECK-LABEL: test_vst1q_s64 +; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8) + ret void +} + +define void @test_vst1q_f32(float* %a, <4 x float> %b) { +; CHECK-LABEL: test_vst1q_f32 +; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4) + ret void +} + +define void 
@test_vst1q_f64(double* %a, <2 x double> %b) { +; CHECK-LABEL: test_vst1q_f64 +; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8) + ret void +} + +define void @test_vst1_s8(i8* %a, <8 x i8> %b) { +; CHECK-LABEL: test_vst1_s8 +; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1) + ret void +} + +define void @test_vst1_s16(i16* %a, <4 x i16> %b) { +; CHECK-LABEL: test_vst1_s16 +; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2) + ret void +} + +define void @test_vst1_s32(i32* %a, <2 x i32> %b) { +; CHECK-LABEL: test_vst1_s32 +; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4) + ret void +} + +define void @test_vst1_s64(i64* %a, <1 x i64> %b) { +; CHECK-LABEL: test_vst1_s64 +; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8) + ret void +} + +define void @test_vst1_f32(float* %a, <2 x float> %b) { +; CHECK-LABEL: test_vst1_f32 +; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4) + ret void +} + +define void @test_vst1_f64(double* %a, <1 x double> %b) { +; CHECK-LABEL: test_vst1_f64 +; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8) + ret void +} + +define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst2q_s8 +; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1) + ret void +} + +define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst2q_s16 +; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2) + ret void +} + +define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst2q_s32 +; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst2q_s64 +; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, 
<2 x i64> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst2q_f32 +; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst2q_f64 +; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst2_s8 +; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1) + ret void +} + +define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst2_s16 +; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2) + ret void +} + +define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst2_s32 +; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst2_s64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst2_f32 +; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst2_f64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, 
[{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst3q_s8 +; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1) + ret void +} + +define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst3q_s16 +; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2) + ret void +} + +define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst3q_s32 +; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst3q_s64 +; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst3q_f32 +; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst3q_f64 +; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x 
<2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst3_s8 +; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1) + ret void +} + +define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst3_s16 +; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2) + ret void +} + +define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst3_s32 +; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst3_s64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst3_f32 +; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst3_f64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + 
%b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst4q_s8 +; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 + tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1) + ret void +} + +define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst4q_s16 +; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2) + ret void +} + +define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst4q_s32 +; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst4q_s64 +; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8) + ret void +} + +define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst4q_f32 +; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 
0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst4q_f64 +; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8) + ret void +} + +define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst4_s8 +; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 + tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1) + ret void +} + +define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst4_s16 +; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2) + ret void +} + +define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst4_s32 +; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst4_s64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = 
extractvalue [4 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8) + ret void +} + +define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst4_f32 +; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst4_f64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8) + ret void +} + +declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) +declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) +declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) +declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) +declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) +declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32) +declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) +declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) +declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) +declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32) +declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) +declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32) +declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) +declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) +declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) +declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32) +declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) +declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32) +declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) +declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) +declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) +declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) +declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32) +declare void 
@llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32) +declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) +declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) +declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) +declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32) +declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) +declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32) +declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) +declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) +declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) +declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) +declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) +declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32) +declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) +declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) +declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) +declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32) +declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) +declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) +declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) +declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) +declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) +declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) +declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) +declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32) + +define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) { +; CHECK-LABEL: test_vld1q_s8_x2 +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %1 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1) + %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 + %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 + %4 = insertvalue %struct.int8x16x2_t undef, <16 x i8> %2, 0, 0 + %5 = insertvalue %struct.int8x16x2_t %4, <16 x i8> %3, 0, 1 + ret %struct.int8x16x2_t %5 +} + +define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) { +; CHECK-LABEL: test_vld1q_s16_x2 +; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2) + %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0 + %4 = extractvalue { <8 x i16>, <8 x i16> } %2, 1 + %5 = insertvalue %struct.int16x8x2_t undef, <8 x i16> %3, 0, 0 + %6 = insertvalue %struct.int16x8x2_t %5, <8 x i16> %4, 0, 1 + ret %struct.int16x8x2_t %6 +} + +define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) { +; CHECK-LABEL: test_vld1q_s32_x2 +; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %2 = tail call { <4 x i32>, <4 x i32> } 
@llvm.aarch64.neon.vld1x2.v4i32(i8* %1, i32 4) + %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0 + %4 = extractvalue { <4 x i32>, <4 x i32> } %2, 1 + %5 = insertvalue %struct.int32x4x2_t undef, <4 x i32> %3, 0, 0 + %6 = insertvalue %struct.int32x4x2_t %5, <4 x i32> %4, 0, 1 + ret %struct.int32x4x2_t %6 +} + +define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) { +; CHECK-LABEL: test_vld1q_s64_x2 +; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8* %1, i32 8) + %3 = extractvalue { <2 x i64>, <2 x i64> } %2, 0 + %4 = extractvalue { <2 x i64>, <2 x i64> } %2, 1 + %5 = insertvalue %struct.int64x2x2_t undef, <2 x i64> %3, 0, 0 + %6 = insertvalue %struct.int64x2x2_t %5, <2 x i64> %4, 0, 1 + ret %struct.int64x2x2_t %6 +} + +define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) { +; CHECK-LABEL: test_vld1q_f32_x2 +; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8* %1, i32 4) + %3 = extractvalue { <4 x float>, <4 x float> } %2, 0 + %4 = extractvalue { <4 x float>, <4 x float> } %2, 1 + %5 = insertvalue %struct.float32x4x2_t undef, <4 x float> %3, 0, 0 + %6 = insertvalue %struct.float32x4x2_t %5, <4 x float> %4, 0, 1 + ret %struct.float32x4x2_t %6 +} + + +define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) { +; CHECK-LABEL: test_vld1q_f64_x2 +; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8* %1, i32 8) + %3 = extractvalue { <2 x double>, <2 x double> } %2, 0 + %4 = extractvalue { <2 x double>, <2 x double> } %2, 1 + %5 = insertvalue %struct.float64x2x2_t undef, <2 x double> %3, 0, 0 + %6 = insertvalue %struct.float64x2x2_t %5, <2 x double> %4, 0, 1 + ret %struct.float64x2x2_t %6 +} + +define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) { +; CHECK-LABEL: test_vld1_s8_x2 +; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %1 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8* %a, i32 1) + %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0 + %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1 + %4 = insertvalue %struct.int8x8x2_t undef, <8 x i8> %2, 0, 0 + %5 = insertvalue %struct.int8x8x2_t %4, <8 x i8> %3, 0, 1 + ret %struct.int8x8x2_t %5 +} + +define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) { +; CHECK-LABEL: test_vld1_s16_x2 +; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8* %1, i32 2) + %3 = extractvalue { <4 x i16>, <4 x i16> } %2, 0 + %4 = extractvalue { <4 x i16>, <4 x i16> } %2, 1 + %5 = insertvalue %struct.int16x4x2_t undef, <4 x i16> %3, 0, 0 + %6 = insertvalue %struct.int16x4x2_t %5, <4 x i16> %4, 0, 1 + ret %struct.int16x4x2_t %6 +} + +define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) { +; CHECK-LABEL: test_vld1_s32_x2 +; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8* %1, i32 4) + %3 = extractvalue { <2 x i32>, <2 x i32> } %2, 0 + %4 = extractvalue { <2 x i32>, <2 x i32> } %2, 1 + %5 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %3, 0, 0 + %6 = insertvalue %struct.int32x2x2_t %5, <2 x i32> %4, 0, 1 + ret %struct.int32x2x2_t %6 
+} + +define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) { +; CHECK-LABEL: test_vld1_s64_x2 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8* %1, i32 8) + %3 = extractvalue { <1 x i64>, <1 x i64> } %2, 0 + %4 = extractvalue { <1 x i64>, <1 x i64> } %2, 1 + %5 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %3, 0, 0 + %6 = insertvalue %struct.int64x1x2_t %5, <1 x i64> %4, 0, 1 + ret %struct.int64x1x2_t %6 +} + +define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) { +; CHECK-LABEL: test_vld1_f32_x2 +; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8* %1, i32 4) + %3 = extractvalue { <2 x float>, <2 x float> } %2, 0 + %4 = extractvalue { <2 x float>, <2 x float> } %2, 1 + %5 = insertvalue %struct.float32x2x2_t undef, <2 x float> %3, 0, 0 + %6 = insertvalue %struct.float32x2x2_t %5, <2 x float> %4, 0, 1 + ret %struct.float32x2x2_t %6 +} + +define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) { +; CHECK-LABEL: test_vld1_f64_x2 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8* %1, i32 8) + %3 = extractvalue { <1 x double>, <1 x double> } %2, 0 + %4 = extractvalue { <1 x double>, <1 x double> } %2, 1 + %5 = insertvalue %struct.float64x1x2_t undef, <1 x double> %3, 0, 0 + %6 = insertvalue %struct.float64x1x2_t %5, <1 x double> %4, 0, 1 + ret %struct.float64x1x2_t %6 +} + +define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) { +; CHECK-LABEL: test_vld1q_s8_x3 +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, +; [{{x[0-9]+|sp}}] + %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8* %a, i32 1) + %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 0 + %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 1 + %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 2 + %5 = insertvalue %struct.int8x16x3_t undef, <16 x i8> %2, 0, 0 + %6 = insertvalue %struct.int8x16x3_t %5, <16 x i8> %3, 0, 1 + %7 = insertvalue %struct.int8x16x3_t %6, <16 x i8> %4, 0, 2 + ret %struct.int8x16x3_t %7 +} + +define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) { +; CHECK-LABEL: test_vld1q_s16_x3 +; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, +; [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2) + %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0 + %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 1 + %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 2 + %6 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %3, 0, 0 + %7 = insertvalue %struct.int16x8x3_t %6, <8 x i16> %4, 0, 1 + %8 = insertvalue %struct.int16x8x3_t %7, <8 x i16> %5, 0, 2 + ret %struct.int16x8x3_t %8 +} + +define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) { +; CHECK-LABEL: test_vld1q_s32_x3 +; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, +; [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8* %1, i32 4) + %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 0 + %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 1 + %5 = extractvalue { <4 x i32>, <4 x 
i32>, <4 x i32> } %2, 2 + %6 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %3, 0, 0 + %7 = insertvalue %struct.int32x4x3_t %6, <4 x i32> %4, 0, 1 + %8 = insertvalue %struct.int32x4x3_t %7, <4 x i32> %5, 0, 2 + ret %struct.int32x4x3_t %8 +} + +define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) { +; CHECK-LABEL: test_vld1q_s64_x3 +; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, +; [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8) + %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0 + %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 1 + %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 2 + %6 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %3, 0, 0 + %7 = insertvalue %struct.int64x2x3_t %6, <2 x i64> %4, 0, 1 + %8 = insertvalue %struct.int64x2x3_t %7, <2 x i64> %5, 0, 2 + ret %struct.int64x2x3_t %8 +} + +define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) { +; CHECK-LABEL: test_vld1q_f32_x3 +; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, +; [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %2 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8* %1, i32 4) + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 0 + %4 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 1 + %5 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 2 + %6 = insertvalue %struct.float32x4x3_t undef, <4 x float> %3, 0, 0 + %7 = insertvalue %struct.float32x4x3_t %6, <4 x float> %4, 0, 1 + %8 = insertvalue %struct.float32x4x3_t %7, <4 x float> %5, 0, 2 + ret %struct.float32x4x3_t %8 +} + + +define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) { +; CHECK-LABEL: test_vld1q_f64_x3 +; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, +; [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %2 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8* %1, i32 8) + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 0 + %4 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 1 + %5 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 2 + %6 = insertvalue %struct.float64x2x3_t undef, <2 x double> %3, 0, 0 + %7 = insertvalue %struct.float64x2x3_t %6, <2 x double> %4, 0, 1 + %8 = insertvalue %struct.float64x2x3_t %7, <2 x double> %5, 0, 2 + ret %struct.float64x2x3_t %8 +} + +define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) { +; CHECK-LABEL: test_vld1_s8_x3 +; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, +; [{{x[0-9]+|sp}}] + %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8* %a, i32 1) + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 + %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 + %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 + %5 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %2, 0, 0 + %6 = insertvalue %struct.int8x8x3_t %5, <8 x i8> %3, 0, 1 + %7 = insertvalue %struct.int8x8x3_t %6, <8 x i8> %4, 0, 2 + ret %struct.int8x8x3_t %7 +} + +define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) { +; CHECK-LABEL: test_vld1_s16_x3 +; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, +; [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8* %1, i32 2) + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0 + %4 = extractvalue { <4 x 
i16>, <4 x i16>, <4 x i16> } %2, 1 + %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 + %6 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %3, 0, 0 + %7 = insertvalue %struct.int16x4x3_t %6, <4 x i16> %4, 0, 1 + %8 = insertvalue %struct.int16x4x3_t %7, <4 x i16> %5, 0, 2 + ret %struct.int16x4x3_t %8 +} + +define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) { + %1 = bitcast i32* %a to i8* +; CHECK-LABEL: test_vld1_s32_x3 +; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, +; [{{x[0-9]+|sp}}] + %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8* %1, i32 4) + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 0 + %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 + %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 + %6 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %3, 0, 0 + %7 = insertvalue %struct.int32x2x3_t %6, <2 x i32> %4, 0, 1 + %8 = insertvalue %struct.int32x2x3_t %7, <2 x i32> %5, 0, 2 + ret %struct.int32x2x3_t %8 +} + +define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) { +; CHECK-LABEL: test_vld1_s64_x3 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, +; [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8* %1, i32 8) + %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 0 + %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 1 + %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 2 + %6 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %3, 0, 0 + %7 = insertvalue %struct.int64x1x3_t %6, <1 x i64> %4, 0, 1 + %8 = insertvalue %struct.int64x1x3_t %7, <1 x i64> %5, 0, 2 + ret %struct.int64x1x3_t %8 +} + +define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) { +; CHECK-LABEL: test_vld1_f32_x3 +; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, +; [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %2 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8* %1, i32 4) + %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 0 + %4 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 1 + %5 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 2 + %6 = insertvalue %struct.float32x2x3_t undef, <2 x float> %3, 0, 0 + %7 = insertvalue %struct.float32x2x3_t %6, <2 x float> %4, 0, 1 + %8 = insertvalue %struct.float32x2x3_t %7, <2 x float> %5, 0, 2 + ret %struct.float32x2x3_t %8 +} + + +define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) { +; CHECK-LABEL: test_vld1_f64_x3 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, +; [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %2 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8* %1, i32 8) + %3 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 0 + %4 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 1 + %5 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 2 + %6 = insertvalue %struct.float64x1x3_t undef, <1 x double> %3, 0, 0 + %7 = insertvalue %struct.float64x1x3_t %6, <1 x double> %4, 0, 1 + %8 = insertvalue %struct.float64x1x3_t %7, <1 x double> %5, 0, 2 + ret %struct.float64x1x3_t %8 +} + +define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) { +; CHECK-LABEL: test_vld1q_s8_x4 +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, +; v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } 
@llvm.aarch64.neon.vld1x4.v16i8(i8* %a, i32 1) + %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0 + %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1 + %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2 + %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3 + %6 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %2, 0, 0 + %7 = insertvalue %struct.int8x16x4_t %6, <16 x i8> %3, 0, 1 + %8 = insertvalue %struct.int8x16x4_t %7, <16 x i8> %4, 0, 2 + %9 = insertvalue %struct.int8x16x4_t %8, <16 x i8> %5, 0, 3 + ret %struct.int8x16x4_t %9 +} + +define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) { +; CHECK-LABEL: test_vld1q_s16_x4 +; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, +; v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8* %1, i32 2) + %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 0 + %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 1 + %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 2 + %6 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 3 + %7 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %3, 0, 0 + %8 = insertvalue %struct.int16x8x4_t %7, <8 x i16> %4, 0, 1 + %9 = insertvalue %struct.int16x8x4_t %8, <8 x i16> %5, 0, 2 + %10 = insertvalue %struct.int16x8x4_t %9, <8 x i16> %6, 0, 3 + ret %struct.int16x8x4_t %10 +} + +define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) { +; CHECK-LABEL: test_vld1q_s32_x4 +; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, +; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8* %1, i32 4) + %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 0 + %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 1 + %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 2 + %6 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 3 + %7 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %3, 0, 0 + %8 = insertvalue %struct.int32x4x4_t %7, <4 x i32> %4, 0, 1 + %9 = insertvalue %struct.int32x4x4_t %8, <4 x i32> %5, 0, 2 + %10 = insertvalue %struct.int32x4x4_t %9, <4 x i32> %6, 0, 3 + ret %struct.int32x4x4_t %10 +} + +define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) { +; CHECK-LABEL: test_vld1q_s64_x4 +; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, +; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8* %1, i32 8) + %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 0 + %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 1 + %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 2 + %6 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 3 + %7 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %3, 0, 0 + %8 = insertvalue %struct.int64x2x4_t %7, <2 x i64> %4, 0, 1 + %9 = insertvalue %struct.int64x2x4_t %8, <2 x i64> %5, 0, 2 + %10 = insertvalue %struct.int64x2x4_t %9, <2 x i64> %6, 0, 3 + ret %struct.int64x2x4_t %10 +} + +define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) { +; CHECK-LABEL: test_vld1q_f32_x4 +; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, +; v{{[0-9]+}}.4s}, 
[{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4) + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0 + %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1 + %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2 + %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3 + %7 = insertvalue %struct.float32x4x4_t undef, <4 x float> %3, 0, 0 + %8 = insertvalue %struct.float32x4x4_t %7, <4 x float> %4, 0, 1 + %9 = insertvalue %struct.float32x4x4_t %8, <4 x float> %5, 0, 2 + %10 = insertvalue %struct.float32x4x4_t %9, <4 x float> %6, 0, 3 + ret %struct.float32x4x4_t %10 +} + +define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) { +; CHECK-LABEL: test_vld1q_f64_x4 +; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, +; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8* %1, i32 8) + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0 + %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1 + %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2 + %6 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3 + %7 = insertvalue %struct.float64x2x4_t undef, <2 x double> %3, 0, 0 + %8 = insertvalue %struct.float64x2x4_t %7, <2 x double> %4, 0, 1 + %9 = insertvalue %struct.float64x2x4_t %8, <2 x double> %5, 0, 2 + %10 = insertvalue %struct.float64x2x4_t %9, <2 x double> %6, 0, 3 + ret %struct.float64x2x4_t %10 +} + +define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) { +; CHECK-LABEL: test_vld1_s8_x4 +; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, +; v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1) + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 + %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 + %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 + %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3 + %6 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %2, 0, 0 + %7 = insertvalue %struct.int8x8x4_t %6, <8 x i8> %3, 0, 1 + %8 = insertvalue %struct.int8x8x4_t %7, <8 x i8> %4, 0, 2 + %9 = insertvalue %struct.int8x8x4_t %8, <8 x i8> %5, 0, 3 + ret %struct.int8x8x4_t %9 +} + +define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) { +; CHECK-LABEL: test_vld1_s16_x4 +; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, +; v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8* %1, i32 2) + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 0 + %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 1 + %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 + %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 3 + %7 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %3, 0, 0 + %8 = insertvalue %struct.int16x4x4_t %7, <4 x i16> %4, 0, 1 + %9 = insertvalue %struct.int16x4x4_t %8, <4 x i16> %5, 0, 2 + %10 = insertvalue %struct.int16x4x4_t %9, <4 x i16> %6, 0, 3 + ret %struct.int16x4x4_t %10 +} + 
+define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) { +; CHECK-LABEL: test_vld1_s32_x4 +; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, +; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8* %1, i32 4) + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0 + %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 + %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 + %6 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3 + %7 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %3, 0, 0 + %8 = insertvalue %struct.int32x2x4_t %7, <2 x i32> %4, 0, 1 + %9 = insertvalue %struct.int32x2x4_t %8, <2 x i32> %5, 0, 2 + %10 = insertvalue %struct.int32x2x4_t %9, <2 x i32> %6, 0, 3 + ret %struct.int32x2x4_t %10 +} + +define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) { +; CHECK-LABEL: test_vld1_s64_x4 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, +; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8* %1, i32 8) + %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 0 + %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 1 + %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 2 + %6 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 3 + %7 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %3, 0, 0 + %8 = insertvalue %struct.int64x1x4_t %7, <1 x i64> %4, 0, 1 + %9 = insertvalue %struct.int64x1x4_t %8, <1 x i64> %5, 0, 2 + %10 = insertvalue %struct.int64x1x4_t %9, <1 x i64> %6, 0, 3 + ret %struct.int64x1x4_t %10 +} + +define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) { +; CHECK-LABEL: test_vld1_f32_x4 +; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, +; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %2 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8* %1, i32 4) + %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 0 + %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 1 + %5 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 2 + %6 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 3 + %7 = insertvalue %struct.float32x2x4_t undef, <2 x float> %3, 0, 0 + %8 = insertvalue %struct.float32x2x4_t %7, <2 x float> %4, 0, 1 + %9 = insertvalue %struct.float32x2x4_t %8, <2 x float> %5, 0, 2 + %10 = insertvalue %struct.float32x2x4_t %9, <2 x float> %6, 0, 3 + ret %struct.float32x2x4_t %10 +} + + +define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) { +; CHECK-LABEL: test_vld1_f64_x4 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, +; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %2 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8* %1, i32 8) + %3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 0 + %4 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 1 + %5 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 2 + %6 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 3 + %7 = insertvalue %struct.float64x1x4_t undef, <1 x double> %3, 0, 
0 + %8 = insertvalue %struct.float64x1x4_t %7, <1 x double> %4, 0, 1 + %9 = insertvalue %struct.float64x1x4_t %8, <1 x double> %5, 0, 2 + %10 = insertvalue %struct.float64x1x4_t %9, <1 x double> %6, 0, 3 + ret %struct.float64x1x4_t %10 +} + +define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b) { +; CHECK-LABEL: test_vst1q_s8_x2 +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <16 x i8>] %b, 0 + %2 = extractvalue [2 x <16 x i8>] %b, 1 + tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1) + ret void +} + +define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b) { +; CHECK-LABEL: test_vst1q_s16_x2 +; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <8 x i16>] %b, 0 + %2 = extractvalue [2 x <8 x i16>] %b, 1 + %3 = bitcast i16* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2) + ret void +} + +define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b) { +; CHECK-LABEL: test_vst1q_s32_x2 +; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <4 x i32>] %b, 0 + %2 = extractvalue [2 x <4 x i32>] %b, 1 + %3 = bitcast i32* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v4i32(i8* %3, <4 x i32> %1, <4 x i32> %2, i32 4) + ret void +} + +define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b) { +; CHECK-LABEL: test_vst1q_s64_x2 +; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <2 x i64>] %b, 0 + %2 = extractvalue [2 x <2 x i64>] %b, 1 + %3 = bitcast i64* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v2i64(i8* %3, <2 x i64> %1, <2 x i64> %2, i32 8) + ret void +} + +define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b) { +; CHECK-LABEL: test_vst1q_f32_x2 +; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <4 x float>] %b, 0 + %2 = extractvalue [2 x <4 x float>] %b, 1 + %3 = bitcast float* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v4f32(i8* %3, <4 x float> %1, <4 x float> %2, i32 4) + ret void +} + + +define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b) { +; CHECK-LABEL: test_vst1q_f64_x2 +; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <2 x double>] %b, 0 + %2 = extractvalue [2 x <2 x double>] %b, 1 + %3 = bitcast double* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v2f64(i8* %3, <2 x double> %1, <2 x double> %2, i32 8) + ret void +} + +define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b) { +; CHECK-LABEL: test_vst1_s8_x2 +; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + tail call void @llvm.aarch64.neon.vst1x2.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 1) + ret void +} + +define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b) { +; CHECK-LABEL: test_vst1_s16_x2 +; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <4 x i16>] %b, 0 + %2 = extractvalue [2 x <4 x i16>] %b, 1 + %3 = bitcast i16* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v4i16(i8* %3, <4 x i16> %1, <4 x i16> %2, i32 2) + ret void +} + +define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b) { +; CHECK-LABEL: test_vst1_s32_x2 +; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <2 x i32>] %b, 0 + %2 = extractvalue [2 x <2 x i32>] %b, 1 + %3 = bitcast i32* %a to i8* + 
tail call void @llvm.aarch64.neon.vst1x2.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 4) + ret void +} + +define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b) { +; CHECK-LABEL: test_vst1_s64_x2 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <1 x i64>] %b, 0 + %2 = extractvalue [2 x <1 x i64>] %b, 1 + %3 = bitcast i64* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v1i64(i8* %3, <1 x i64> %1, <1 x i64> %2, i32 8) + ret void +} + +define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b) { +; CHECK-LABEL: test_vst1_f32_x2 +; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <2 x float>] %b, 0 + %2 = extractvalue [2 x <2 x float>] %b, 1 + %3 = bitcast float* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v2f32(i8* %3, <2 x float> %1, <2 x float> %2, i32 4) + ret void +} + +define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b) { +; CHECK-LABEL: test_vst1_f64_x2 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [2 x <1 x double>] %b, 0 + %2 = extractvalue [2 x <1 x double>] %b, 1 + %3 = bitcast double* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v1f64(i8* %3, <1 x double> %1, <1 x double> %2, i32 8) + ret void +} + +define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b) { +; CHECK-LABEL: test_vst1q_s8_x3 +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <16 x i8>] %b, 0 + %2 = extractvalue [3 x <16 x i8>] %b, 1 + %3 = extractvalue [3 x <16 x i8>] %b, 2 + tail call void @llvm.aarch64.neon.vst1x3.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, i32 1) + ret void +} + +define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b) { +; CHECK-LABEL: test_vst1q_s16_x3 +; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <8 x i16>] %b, 0 + %2 = extractvalue [3 x <8 x i16>] %b, 1 + %3 = extractvalue [3 x <8 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v8i16(i8* %4, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, i32 2) + ret void +} + +define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b) { +; CHECK-LABEL: test_vst1q_s32_x3 +; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <4 x i32>] %b, 0 + %2 = extractvalue [3 x <4 x i32>] %b, 1 + %3 = extractvalue [3 x <4 x i32>] %b, 2 + %4 = bitcast i32* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v4i32(i8* %4, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, i32 4) + ret void +} + +define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b) { +; CHECK-LABEL: test_vst1q_s64_x3 +; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <2 x i64>] %b, 0 + %2 = extractvalue [3 x <2 x i64>] %b, 1 + %3 = extractvalue [3 x <2 x i64>] %b, 2 + %4 = bitcast i64* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v2i64(i8* %4, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, i32 8) + ret void +} + +define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b) { +; CHECK-LABEL: test_vst1q_f32_x3 +; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <4 x float>] %b, 0 + %2 = extractvalue [3 x <4 x float>] %b, 1 + %3 = extractvalue [3 x <4 x float>] %b, 2 + %4 = bitcast float* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 4) 
+ ret void +} + +define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b) { +; CHECK-LABEL: test_vst1q_f64_x3 +; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <2 x double>] %b, 0 + %2 = extractvalue [3 x <2 x double>] %b, 1 + %3 = extractvalue [3 x <2 x double>] %b, 2 + %4 = bitcast double* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v2f64(i8* %4, <2 x double> %1, <2 x double> %2, <2 x double> %3, i32 8) + ret void +} + +define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b) { +; CHECK-LABEL: test_vst1_s8_x3 +; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <8 x i8>] %b, 0 + %2 = extractvalue [3 x <8 x i8>] %b, 1 + %3 = extractvalue [3 x <8 x i8>] %b, 2 + tail call void @llvm.aarch64.neon.vst1x3.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1) + ret void +} + +define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b) { +; CHECK-LABEL: test_vst1_s16_x3 +; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <4 x i16>] %b, 0 + %2 = extractvalue [3 x <4 x i16>] %b, 1 + %3 = extractvalue [3 x <4 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2) + ret void +} + +define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b) { +; CHECK-LABEL: test_vst1_s32_x3 +; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <2 x i32>] %b, 0 + %2 = extractvalue [3 x <2 x i32>] %b, 1 + %3 = extractvalue [3 x <2 x i32>] %b, 2 + %4 = bitcast i32* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4) + ret void +} + +define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b) { +; CHECK-LABEL: test_vst1_s64_x3 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <1 x i64>] %b, 0 + %2 = extractvalue [3 x <1 x i64>] %b, 1 + %3 = extractvalue [3 x <1 x i64>] %b, 2 + %4 = bitcast i64* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8) + ret void +} + +define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b) { +; CHECK-LABEL: test_vst1_f32_x3 +; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <2 x float>] %b, 0 + %2 = extractvalue [3 x <2 x float>] %b, 1 + %3 = extractvalue [3 x <2 x float>] %b, 2 + %4 = bitcast float* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4) + ret void +} + +define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b) { +; CHECK-LABEL: test_vst1_f64_x3 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, +; [{{x[0-9]+|sp}}] + %1 = extractvalue [3 x <1 x double>] %b, 0 + %2 = extractvalue [3 x <1 x double>] %b, 1 + %3 = extractvalue [3 x <1 x double>] %b, 2 + %4 = bitcast double* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v1f64(i8* %4, <1 x double> %1, <1 x double> %2, <1 x double> %3, i32 8) + ret void +} + +define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b) { +; CHECK-LABEL: test_vst1q_s8_x4 +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, +; v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <16 x i8>] %b, 0 + %2 = extractvalue [4 x <16 x i8>] %b, 1 + %3 = extractvalue [4 x 
<16 x i8>] %b, 2 + %4 = extractvalue [4 x <16 x i8>] %b, 3 + tail call void @llvm.aarch64.neon.vst1x4.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, i32 1) + ret void +} + +define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b) { +; CHECK-LABEL: test_vst1q_s16_x4 +; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, +; v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <8 x i16>] %b, 0 + %2 = extractvalue [4 x <8 x i16>] %b, 1 + %3 = extractvalue [4 x <8 x i16>] %b, 2 + %4 = extractvalue [4 x <8 x i16>] %b, 3 + %5 = bitcast i16* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v8i16(i8* %5, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, <8 x i16> %4, i32 2) + ret void +} + +define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b) { +; CHECK-LABEL: test_vst1q_s32_x4 +; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, +; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <4 x i32>] %b, 0 + %2 = extractvalue [4 x <4 x i32>] %b, 1 + %3 = extractvalue [4 x <4 x i32>] %b, 2 + %4 = extractvalue [4 x <4 x i32>] %b, 3 + %5 = bitcast i32* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v4i32(i8* %5, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, i32 4) + ret void +} + +define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b) { +; CHECK-LABEL: test_vst1q_s64_x4 +; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, +; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <2 x i64>] %b, 0 + %2 = extractvalue [4 x <2 x i64>] %b, 1 + %3 = extractvalue [4 x <2 x i64>] %b, 2 + %4 = extractvalue [4 x <2 x i64>] %b, 3 + %5 = bitcast i64* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v2i64(i8* %5, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, i32 8) + ret void +} + +define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b) { +; CHECK-LABEL: test_vst1q_f32_x4 +; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, +; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <4 x float>] %b, 0 + %2 = extractvalue [4 x <4 x float>] %b, 1 + %3 = extractvalue [4 x <4 x float>] %b, 2 + %4 = extractvalue [4 x <4 x float>] %b, 3 + %5 = bitcast float* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4) + ret void +} + +define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b) { +; CHECK-LABEL: test_vst1q_f64_x4 +; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, +; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <2 x double>] %b, 0 + %2 = extractvalue [4 x <2 x double>] %b, 1 + %3 = extractvalue [4 x <2 x double>] %b, 2 + %4 = extractvalue [4 x <2 x double>] %b, 3 + %5 = bitcast double* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8) + ret void +} + +define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b) { +; CHECK-LABEL: test_vst1_s8_x4 +; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, +; v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <8 x i8>] %b, 0 + %2 = extractvalue [4 x <8 x i8>] %b, 1 + %3 = extractvalue [4 x <8 x i8>] %b, 2 + %4 = extractvalue [4 x <8 x i8>] %b, 3 + tail call void @llvm.aarch64.neon.vst1x4.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %4, i32 1) + ret void +} + +define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b) { +; CHECK-LABEL: test_vst1_s16_x4 +; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, +; 
v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <4 x i16>] %b, 0 + %2 = extractvalue [4 x <4 x i16>] %b, 1 + %3 = extractvalue [4 x <4 x i16>] %b, 2 + %4 = extractvalue [4 x <4 x i16>] %b, 3 + %5 = bitcast i16* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v4i16(i8* %5, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, <4 x i16> %4, i32 2) + ret void +} + +define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b) { +; CHECK-LABEL: test_vst1_s32_x4 +; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, +; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <2 x i32>] %b, 0 + %2 = extractvalue [4 x <2 x i32>] %b, 1 + %3 = extractvalue [4 x <2 x i32>] %b, 2 + %4 = extractvalue [4 x <2 x i32>] %b, 3 + %5 = bitcast i32* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 4) + ret void +} + +define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b) { +; CHECK-LABEL: test_vst1_s64_x4 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, +; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <1 x i64>] %b, 0 + %2 = extractvalue [4 x <1 x i64>] %b, 1 + %3 = extractvalue [4 x <1 x i64>] %b, 2 + %4 = extractvalue [4 x <1 x i64>] %b, 3 + %5 = bitcast i64* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v1i64(i8* %5, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, <1 x i64> %4, i32 8) + ret void +} + +define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b) { +; CHECK-LABEL: test_vst1_f32_x4 +; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, +; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <2 x float>] %b, 0 + %2 = extractvalue [4 x <2 x float>] %b, 1 + %3 = extractvalue [4 x <2 x float>] %b, 2 + %4 = extractvalue [4 x <2 x float>] %b, 3 + %5 = bitcast float* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 4) + ret void +} + +define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b) { +; CHECK-LABEL: test_vst1_f64_x4 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, +; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = extractvalue [4 x <1 x double>] %b, 0 + %2 = extractvalue [4 x <1 x double>] %b, 1 + %3 = extractvalue [4 x <1 x double>] %b, 2 + %4 = extractvalue [4 x <1 x double>] %b, 3 + %5 = bitcast double* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v1f64(i8* %5, <1 x double> %1, <1 x double> %2, <1 x double> %3, <1 x double> %4, i32 8) + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32) +declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8*, i32) +declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8*, i32) +declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8*, i32) +declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8*, i32) +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8*, i32) +declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8*, i32) +declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8*, i32) +declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8*, i32) +declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8*, i32) +declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8*, i32) 
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8*, i32)
+declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4f32(i8*, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2f64(i8*, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2f32(i8*, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v1f64(i8*, <1 x double>, <1 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
new file mode 100644
index 0000000000000..3f28320f23d59
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
@@ -0,0 +1,2113 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] } +%struct.int64x2x4_t = type { [4 x <2 x i64>] } +%struct.float32x4x4_t = type { [4 x <4 x float>] } +%struct.float64x2x4_t = type { [4 x <2 x double>] } +%struct.int8x8x4_t = type { [4 x <8 x i8>] } +%struct.int16x4x4_t = type { [4 x <4 x i16>] } +%struct.int32x2x4_t = type { [4 x <2 x i32>] } +%struct.int64x1x4_t = type { [4 x <1 x i64>] } +%struct.float32x2x4_t = type { [4 x <2 x float>] } +%struct.float64x1x4_t = type { [4 x <1 x double>] } + +define <16 x i8> @test_vld1q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld1q_dup_s8 +; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0] +entry: + %0 = load i8* %a, align 1 + %1 = insertelement <16 x i8> undef, i8 %0, i32 0 + %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %lane +} + +define <8 x i16> @test_vld1q_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld1q_dup_s16 +; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0] +entry: + %0 = load i16* %a, align 2 + %1 = insertelement <8 x i16> undef, i16 %0, i32 0 + %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %lane +} + +define <4 x i32> @test_vld1q_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld1q_dup_s32 +; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0] +entry: + %0 = load i32* %a, align 4 + %1 = insertelement <4 x i32> undef, i32 %0, i32 0 + %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %lane +} + +define <2 x i64> @test_vld1q_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld1q_dup_s64 +; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0] +entry: + %0 = load i64* %a, align 8 + %1 = insertelement <2 x i64> undef, i64 %0, i32 0 + %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %lane +} + +define <4 x float> @test_vld1q_dup_f32(float* %a) { +; CHECK-LABEL: test_vld1q_dup_f32 +; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0] +entry: + %0 = load float* %a, align 4 + %1 = insertelement <4 x float> undef, float %0, i32 0 + %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %lane +} + +define <2 x double> @test_vld1q_dup_f64(double* %a) { +; CHECK-LABEL: test_vld1q_dup_f64 +; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0] +entry: + %0 = load double* %a, align 8 + %1 = insertelement <2 x double> undef, double %0, i32 0 + %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer + ret <2 x double> %lane +} + +define <8 x i8> @test_vld1_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld1_dup_s8 +; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0] +entry: + %0 = load i8* %a, align 1 + %1 = insertelement <8 x i8> undef, i8 %0, i32 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %lane +} + +define <4 x i16> @test_vld1_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld1_dup_s16 +; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0] +entry: + %0 = load i16* %a, align 2 + %1 = insertelement <4 x i16> undef, i16 %0, i32 0 + %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %lane +} + +define <2 x i32> @test_vld1_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld1_dup_s32 +; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0] +entry: + %0 = load i32* %a, align 4 + %1 = insertelement <2 x i32> undef, i32 %0, i32 0 + %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %lane +} + +define <1 x i64> @test_vld1_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld1_dup_s64 +; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0] +entry: + %0 = load i64* %a, align 8 + %1 = 
insertelement <1 x i64> undef, i64 %0, i32 0 + ret <1 x i64> %1 +} + +define <2 x float> @test_vld1_dup_f32(float* %a) { +; CHECK-LABEL: test_vld1_dup_f32 +; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0] +entry: + %0 = load float* %a, align 4 + %1 = insertelement <2 x float> undef, float %0, i32 0 + %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %lane +} + +define <1 x double> @test_vld1_dup_f64(double* %a) { +; CHECK-LABEL: test_vld1_dup_f64 +; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0] +entry: + %0 = load double* %a, align 8 + %1 = insertelement <1 x double> undef, double %0, i32 0 + ret <1 x double> %1 +} + +define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld2q_dup_s8 +; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0] +entry: + %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0 + %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer + %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1 + %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld2q_dup_s16 +; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0 + %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1 + %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld2q_dup_s32 +; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0 + %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1 + %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld2q_dup_s64 +; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8) + %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0 + %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer + %2 = 
extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1 + %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 + ret %struct.int64x2x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) { +; CHECK-LABEL: test_vld2q_dup_f32 +; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4) + %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0 + %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1 + %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) { +; CHECK-LABEL: test_vld2q_dup_f64 +; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0 + %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1 + %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1 + ret %struct.float64x2x2_t %.fca.0.1.insert +} + +define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld2_dup_s8 +; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0] +entry: + %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0 + %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer + %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 + ret %struct.int8x8x2_t %.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld2_dup_s16 +; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0 + %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1 + %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0 + 
%.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld2_dup_s32 +; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0 + %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1 + %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld2_dup_s64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1 + %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1 + ret %struct.int64x1x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) { +; CHECK-LABEL: test_vld2_dup_f32 +; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4) + %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0 + %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1 + %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) { +; CHECK-LABEL: test_vld2_dup_f64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1 + %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 + ret %struct.float64x1x2_t %.fca.0.1.insert +} + +define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld3q_dup_s8 +; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0] +entry: + %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 
1) + %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0 + %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer + %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1 + %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer + %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2 + %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2 + ret %struct.int8x16x3_t %.fca.0.2.insert +} + +define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld3q_dup_s16 +; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0 + %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1 + %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer + %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2 + %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2 + ret %struct.int16x8x3_t %.fca.0.2.insert +} + +define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld3q_dup_s32 +; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0 + %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1 + %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2 + %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2 + ret %struct.int32x4x3_t %.fca.0.2.insert +} + +define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld3q_dup_s64 +; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8) + %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0 + %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer + %2 = 
extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1 + %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2 + %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2 + ret %struct.int64x2x3_t %.fca.0.2.insert +} + +define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) { +; CHECK-LABEL: test_vld3q_dup_f32 +; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4) + %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0 + %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1 + %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2 + %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2 + ret %struct.float32x4x3_t %.fca.0.2.insert +} + +define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) { +; CHECK-LABEL: test_vld3q_dup_f64 +; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0 + %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1 + %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2 + %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2 + ret %struct.float64x2x3_t %.fca.0.2.insert +} + +define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld3_dup_s8 +; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0] +entry: + %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0 + %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer + %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1 + 
%lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2 + %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2 + ret %struct.int8x8x3_t %.fca.0.2.insert +} + +define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld3_dup_s16 +; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0 + %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1 + %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2 + %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2 + ret %struct.int16x4x3_t %.fca.0.2.insert +} + +define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld3_dup_s32 +; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0 + %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1 + %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2 + %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2 + ret %struct.int32x2x3_t %.fca.0.2.insert +} + +define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld3_dup_s64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1 + %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2 + %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> 
%vld_dup.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2 + ret %struct.int64x1x3_t %.fca.0.2.insert +} + +define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) { +; CHECK-LABEL: test_vld3_dup_f32 +; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4) + %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0 + %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1 + %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2 + %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2 + ret %struct.float32x2x3_t %.fca.0.2.insert +} + +define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) { +; CHECK-LABEL: test_vld3_dup_f64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1 + %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2 + %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2 + ret %struct.float64x1x3_t %.fca.0.2.insert +} + +define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld4q_dup_s8 +; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0] +entry: + %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0 + %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer + %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1 + %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer + %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2 + %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3 + %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x 
i8> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3 + ret %struct.int8x16x4_t %.fca.0.3.insert +} + +define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld4q_dup_s16 +; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0 + %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1 + %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer + %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2 + %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer + %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3 + %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3 + ret %struct.int16x8x4_t %.fca.0.3.insert +} + +define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld4q_dup_s32 +; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0 + %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1 + %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2 + %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3 + %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3 + ret %struct.int32x4x4_t %.fca.0.3.insert +} + +define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld4q_dup_s64 +; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> 
undef, <2 x i64> undef, i32 0, i32 8) + %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0 + %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1 + %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2 + %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer + %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3 + %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3 + ret %struct.int64x2x4_t %.fca.0.3.insert +} + +define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) { +; CHECK-LABEL: test_vld4q_dup_f32 +; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4) + %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0 + %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1 + %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2 + %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer + %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3 + %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3 + ret %struct.float32x4x4_t %.fca.0.3.insert +} + +define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) { +; CHECK-LABEL: test_vld4q_dup_f64 +; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0 + %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1 + %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2 + %lane2 = 
shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer + %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3 + %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3 + ret %struct.float64x2x4_t %.fca.0.3.insert +} + +define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) { +; CHECK-LABEL: test_vld4_dup_s8 +; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0] +entry: + %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0 + %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer + %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2 + %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3 + %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3 + ret %struct.int8x8x4_t %.fca.0.3.insert +} + +define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) { +; CHECK-LABEL: test_vld4_dup_s16 +; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0] +entry: + %0 = bitcast i16* %a to i8* + %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0 + %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer + %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1 + %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2 + %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer + %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3 + %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3 + ret %struct.int16x4x4_t %.fca.0.3.insert +} + +define 
%struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) { +; CHECK-LABEL: test_vld4_dup_s32 +; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast i32* %a to i8* + %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0 + %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1 + %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2 + %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer + %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3 + %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3 + ret %struct.int32x2x4_t %.fca.0.3.insert +} + +define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) { +; CHECK-LABEL: test_vld4_dup_s64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast i64* %a to i8* + %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1 + %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2 + %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3 + %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3 + ret %struct.int64x1x4_t %.fca.0.3.insert +} + +define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) { +; CHECK-LABEL: test_vld4_dup_f32 +; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0] +entry: + %0 = bitcast float* %a to i8* + %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4) + %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0 + %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer + %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1 + %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer + %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2 + %lane2 = shufflevector <2 x float> %3, 
<2 x float> undef, <2 x i32> zeroinitializer + %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3 + %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer + %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3 + ret %struct.float32x2x4_t %.fca.0.3.insert +} + +define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) { +; CHECK-LABEL: test_vld4_dup_f64 +; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0] +entry: + %0 = bitcast double* %a to i8* + %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8) + %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0 + %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1 + %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2 + %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3 + %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3 + ret %struct.float64x1x4_t %.fca.0.3.insert +} + +define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) { +; CHECK-LABEL: test_vld1q_lane_s8 +; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %0 = load i8* %a, align 1 + %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15 + ret <16 x i8> %vld1_lane +} + +define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) { +; CHECK-LABEL: test_vld1q_lane_s16 +; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %0 = load i16* %a, align 2 + %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7 + ret <8 x i16> %vld1_lane +} + +define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) { +; CHECK-LABEL: test_vld1q_lane_s32 +; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = load i32* %a, align 4 + %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3 + ret <4 x i32> %vld1_lane +} + +define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) { +; CHECK-LABEL: test_vld1q_lane_s64 +; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = load i64* %a, align 8 + %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1 + ret <2 x i64> %vld1_lane +} + +define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) { +; CHECK-LABEL: test_vld1q_lane_f32 +; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = load float* %a, align 4 + %vld1_lane = insertelement <4 x float> %b, float %0, i32 3 + ret <4 x float> %vld1_lane +} + +define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) { +; CHECK-LABEL: test_vld1q_lane_f64 +; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = load double* %a, align 8 + %vld1_lane = insertelement <2 x 
double> %b, double %0, i32 1 + ret <2 x double> %vld1_lane +} + +define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) { +; CHECK-LABEL: test_vld1_lane_s8 +; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %0 = load i8* %a, align 1 + %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7 + ret <8 x i8> %vld1_lane +} + +define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) { +; CHECK-LABEL: test_vld1_lane_s16 +; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %0 = load i16* %a, align 2 + %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3 + ret <4 x i16> %vld1_lane +} + +define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) { +; CHECK-LABEL: test_vld1_lane_s32 +; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = load i32* %a, align 4 + %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1 + ret <2 x i32> %vld1_lane +} + +define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) { +; CHECK-LABEL: test_vld1_lane_s64 +; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0] +entry: + %0 = load i64* %a, align 8 + %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0 + ret <1 x i64> %vld1_lane +} + +define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) { +; CHECK-LABEL: test_vld1_lane_f32 +; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = load float* %a, align 4 + %vld1_lane = insertelement <2 x float> %b, float %0, i32 1 + ret <2 x float> %vld1_lane +} + +define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) { +; CHECK-LABEL: test_vld1_lane_f64 +; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0] +entry: + %0 = load double* %a, align 8 + %vld1_lane = insertelement <1 x double> undef, double %0, i32 0 + ret <1 x double> %vld1_lane +} + +define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vld2q_lane_s16 +; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 + %0 = bitcast i16* %a to i8* + %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2) + %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld2q_lane_s32 +; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 + %0 = bitcast i32* %a to i8* + %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4) + %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> 
%vld2_lane.fca.1.extract, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld2q_lane_s64 +; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 + %0 = bitcast i64* %a to i8* + %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8) + %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.int64x2x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vld2q_lane_f32 +; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 + %0 = bitcast float* %a to i8* + %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4) + %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vld2q_lane_f64 +; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 + %0 = bitcast double* %a to i8* + %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8) + %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.float64x2x2_t %.fca.0.1.insert +} + +define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vld2_lane_s8 +; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 + %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1) + 
%vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.int8x8x2_t %.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vld2_lane_s16 +; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 + %0 = bitcast i16* %a to i8* + %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2) + %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld2_lane_s32 +; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 + %0 = bitcast i32* %a to i8* + %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4) + %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld2_lane_s64 +; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 + %0 = bitcast i64* %a to i8* + %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8) + %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.int64x1x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vld2_lane_f32 +; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 + 
%b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 + %0 = bitcast float* %a to i8* + %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4) + %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vld2_lane_f64 +; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 + %0 = bitcast double* %a to i8* + %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8) + %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.float64x1x2_t %.fca.0.1.insert +} + +define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_s16 +; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2) + %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int16x8x3_t %.fca.0.2.insert +} + +define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_s32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x 
i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int32x4x3_t %.fca.0.2.insert +} + +define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_s64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int64x2x3_t %.fca.0.2.insert +} + +define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_f32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.float32x4x3_t %.fca.0.2.insert +} + +define %struct.float64x2x3_t 
@test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_f64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.float64x2x3_t %.fca.0.2.insert +} + +define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s8 +; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 + %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1) + %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int8x8x3_t %.fca.0.2.insert +} + +define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s16 +; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2) + %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } 
%vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int16x4x3_t %.fca.0.2.insert +} + +define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int32x2x3_t %.fca.0.2.insert +} + +define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int64x1x3_t %.fca.0.2.insert +} + +define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_f32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } 
@llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.float32x2x3_t %.fca.0.2.insert +} + +define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_f64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.float64x1x3_t %.fca.0.2.insert +} + +define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vld4q_lane_s8 +; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1) + %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue 
%struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int8x16x4_t %.fca.0.3.insert +} + +define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vld4q_lane_s16 +; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 + %0 = bitcast i16* %a to i8* + %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2) + %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int16x8x4_t %.fca.0.3.insert +} + +define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld4q_lane_s32 +; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 + %0 = bitcast i32* %a to i8* + %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> 
%vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int32x4x4_t %.fca.0.3.insert +} + +define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld4q_lane_s64 +; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 + %0 = bitcast i64* %a to i8* + %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int64x2x4_t %.fca.0.3.insert +} + +define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vld4q_lane_f32 +; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 + %0 = bitcast float* %a to i8* + %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x4x4_t 
%.fca.0.2.insert, <4 x float> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.float32x4x4_t %.fca.0.3.insert +} + +define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vld4q_lane_f64 +; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 + %0 = bitcast double* %a to i8* + %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.float64x2x4_t %.fca.0.3.insert +} + +define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vld4_lane_s8 +; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 + %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1) + %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int8x8x4_t %.fca.0.3.insert +} + +define 
%struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vld4_lane_s16 +; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 + %0 = bitcast i16* %a to i8* + %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2) + %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int16x4x4_t %.fca.0.3.insert +} + +define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld4_lane_s32 +; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 + %0 = bitcast i32* %a to i8* + %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int32x2x4_t %.fca.0.3.insert +} + +define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld4_lane_s64 +; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, 
{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 + %0 = bitcast i64* %a to i8* + %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.int64x1x4_t %.fca.0.3.insert +} + +define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vld4_lane_f32 +; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 + %0 = bitcast float* %a to i8* + %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.float32x2x4_t %.fca.0.3.insert +} + +define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vld4_lane_f64 +; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + 
%b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 + %0 = bitcast double* %a to i8* + %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2 + %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3 + %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3 + ret %struct.float64x1x4_t %.fca.0.3.insert +} + +define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) { +; CHECK-LABEL: test_vst1q_lane_s8 +; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <16 x i8> %b, i32 15 + store i8 %0, i8* %a, align 1 + ret void +} + +define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) { +; CHECK-LABEL: test_vst1q_lane_s16 +; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <8 x i16> %b, i32 7 + store i16 %0, i16* %a, align 2 + ret void +} + +define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) { +; CHECK-LABEL: test_vst1q_lane_s32 +; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <4 x i32> %b, i32 3 + store i32 %0, i32* %a, align 4 + ret void +} + +define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) { +; CHECK-LABEL: test_vst1q_lane_s64 +; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <2 x i64> %b, i32 1 + store i64 %0, i64* %a, align 8 + ret void +} + +define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) { +; CHECK-LABEL: test_vst1q_lane_f32 +; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <4 x float> %b, i32 3 + store float %0, float* %a, align 4 + ret void +} + +define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) { +; CHECK-LABEL: test_vst1q_lane_f64 +; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <2 x double> %b, i32 1 + store double %0, double* %a, align 8 + ret void +} + +define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) { +; CHECK-LABEL: test_vst1_lane_s8 +; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <8 x i8> %b, i32 7 + store i8 %0, i8* %a, align 1 + ret void +} + +define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) { +; CHECK-LABEL: test_vst1_lane_s16 +; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <4 x i16> %b, i32 3 + store i16 %0, i16* %a, align 2 + ret void +} + +define void 
@test_vst1_lane_s32(i32* %a, <2 x i32> %b) { +; CHECK-LABEL: test_vst1_lane_s32 +; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <2 x i32> %b, i32 1 + store i32 %0, i32* %a, align 4 + ret void +} + +define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) { +; CHECK-LABEL: test_vst1_lane_s64 +; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <1 x i64> %b, i32 0 + store i64 %0, i64* %a, align 8 + ret void +} + +define void @test_vst1_lane_f32(float* %a, <2 x float> %b) { +; CHECK-LABEL: test_vst1_lane_f32 +; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <2 x float> %b, i32 1 + store float %0, float* %a, align 4 + ret void +} + +define void @test_vst1_lane_f64(double* %a, <1 x double> %b) { +; CHECK-LABEL: test_vst1_lane_f64 +; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %0 = extractelement <1 x double> %b, i32 0 + store double %0, double* %a, align 8 + ret void +} + +define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s8 +; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1) + ret void +} + +define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s16 +; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2) + ret void +} + +define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4) + ret void +} + +define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_s64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8) + ret void +} + +define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst2q_lane_f32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4) + ret void +} + +define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) { +; 
CHECK-LABEL: test_vst2q_lane_f64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8) + ret void +} + +define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s8 +; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1) + ret void +} + +define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s16 +; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2) + ret void +} + +define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4) + ret void +} + +define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_s64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8) + ret void +} + +define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_f32 +; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4) + ret void +} + +define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst2_lane_f64 +; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8) + ret void +} + +define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: 
test_vst3q_lane_s8 +; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1) + ret void +} + +define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_s16 +; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2) + ret void +} + +define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_s32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4) + ret void +} + +define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_s64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8) + ret void +} + +define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_f32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4) + ret void +} + +define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst3q_lane_f64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> 
%b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8) + ret void +} + +define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s8 +; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1) + ret void +} + +define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s16 +; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2) + ret void +} + +define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4) + ret void +} + +define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_s64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8) + ret void +} + +define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_f32 +; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4) + ret void +} + +define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst3_lane_f64 +; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] 
%b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8) + ret void +} + +define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s8 +; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2) + ret void +} + +define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s16 +; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2) + ret void +} + +define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4) + ret void +} + +define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_s64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8) + ret void +} + +define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_f32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 
x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4) + ret void +} + +define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vst4q_lane_f64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8) + ret void +} + +define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_s8 +; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 + tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1) + ret void +} + +define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_s16 +; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 + %0 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2) + ret void +} + +define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_s32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 + %0 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4) + ret void +} + +define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: 
test_vst4_lane_s64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 + %0 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8) + ret void +} + +define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_f32 +; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 + %0 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4) + ret void +} + +define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vst4_lane_f64 +; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 + %0 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8) + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32) +declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16> } 
@llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 
x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x 
double>, <1 x double>, i32, i32)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-simd-ldst.ll b/test/CodeGen/AArch64/neon-simd-ldst.ll new file mode 100644 index 0000000000000..afc0901bbc0bb --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-ldst.ll @@ -0,0 +1,164 @@ +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define void @test_ldstq_4v(i8* noalias %io, i32 %count) { +; CHECK-LABEL: test_ldstq_4v +; CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] +; CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] +entry: + %tobool62 = icmp eq i32 %count, 0 + br i1 %tobool62, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %count.addr.063 = phi i32 [ %dec, %while.body ], [ %count, %entry ] + %dec = add i32 %count.addr.063, -1 + %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %io, i32 1) + %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3 + tail call void @llvm.arm.neon.vst4.v16i8(i8* %io, <16 x i8> %vld4.fca.0.extract, <16 x i8> %vld4.fca.1.extract, <16 x i8> %vld4.fca.2.extract, <16 x i8> %vld4.fca.3.extract, i32 1) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) + +declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) + +define void @test_ldstq_3v(i8* noalias %io, i32 %count) { +; CHECK-LABEL: test_ldstq_3v +; CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0] +; CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0] +entry: + %tobool47 = icmp eq i32 %count, 0 + br i1 %tobool47, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %count.addr.048 = phi i32 [ %dec, %while.body ], [ %count, %entry ] + %dec = add i32 %count.addr.048, -1 + %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %io, i32 1) + %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2 + tail call void @llvm.arm.neon.vst3.v16i8(i8* %io, <16 x i8> %vld3.fca.0.extract, <16 x i8> %vld3.fca.1.extract, <16 x i8> %vld3.fca.2.extract, i32 1) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) + +declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) + +define void @test_ldstq_2v(i8* noalias %io, i32 %count) { +; CHECK-LABEL: test_ldstq_2v +; CHECK: ld2 {v0.16b, v1.16b}, [x0] +; CHECK: st2 {v0.16b, v1.16b}, [x0] +entry: + %tobool22 = icmp eq i32 %count, 0 + br i1 %tobool22, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ] + %dec = add i32 %count.addr.023, -1 + %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %io, i32 1) + 
%vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1 + tail call void @llvm.arm.neon.vst2.v16i8(i8* %io, <16 x i8> %vld2.fca.0.extract, <16 x i8> %vld2.fca.1.extract, i32 1) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) + +declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) + +define void @test_ldst_4v(i8* noalias %io, i32 %count) { +; CHECK-LABEL: test_ldst_4v +; CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] +; CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] +entry: + %tobool42 = icmp eq i32 %count, 0 + br i1 %tobool42, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %count.addr.043 = phi i32 [ %dec, %while.body ], [ %count, %entry ] + %dec = add i32 %count.addr.043, -1 + %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %io, i32 1) + %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3 + tail call void @llvm.arm.neon.vst4.v8i8(i8* %io, <8 x i8> %vld4.fca.0.extract, <8 x i8> %vld4.fca.1.extract, <8 x i8> %vld4.fca.2.extract, <8 x i8> %vld4.fca.3.extract, i32 1) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32) + +declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) + +define void @test_ldst_3v(i8* noalias %io, i32 %count) { +; CHECK-LABEL: test_ldst_3v +; CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0] +; CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0] +entry: + %tobool32 = icmp eq i32 %count, 0 + br i1 %tobool32, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %count.addr.033 = phi i32 [ %dec, %while.body ], [ %count, %entry ] + %dec = add i32 %count.addr.033, -1 + %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %io, i32 1) + %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2 + tail call void @llvm.arm.neon.vst3.v8i8(i8* %io, <8 x i8> %vld3.fca.0.extract, <8 x i8> %vld3.fca.1.extract, <8 x i8> %vld3.fca.2.extract, i32 1) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32) + +declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) + +define void @test_ldst_2v(i8* noalias %io, i32 %count) { +; CHECK-LABEL: test_ldst_2v +; CHECK: ld2 {v0.8b, v1.8b}, [x0] +; CHECK: st2 {v0.8b, v1.8b}, [x0] +entry: + %tobool22 = icmp eq i32 %count, 0 + br i1 %tobool22, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ] + %dec = add i32 
%count.addr.023, -1 + %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %io, i32 1) + %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1 + tail call void @llvm.arm.neon.vst2.v8i8(i8* %io, <8 x i8> %vld2.fca.0.extract, <8 x i8> %vld2.fca.1.extract, i32 1) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32) + +declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) + diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll new file mode 100644 index 0000000000000..156fe1db0ff56 --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll @@ -0,0 +1,354 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +;Check for a post-increment updating load. +define <4 x i16> @test_vld1_fx_update(i16** %ptr) nounwind { +; CHECK: test_vld1_fx_update +; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #8 + %A = load i16** %ptr + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 2) + %tmp2 = getelementptr i16* %A, i32 4 + store i16* %tmp2, i16** %ptr + ret <4 x i16> %tmp1 +} + +;Check for a post-increment updating load with register increment. +define <2 x i32> @test_vld1_reg_update(i32** %ptr, i32 %inc) nounwind { +; CHECK: test_vld1_reg_update +; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %A = load i32** %ptr + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 4) + %tmp2 = getelementptr i32* %A, i32 %inc + store i32* %tmp2, i32** %ptr + ret <2 x i32> %tmp1 +} + +define <2 x float> @test_vld2_fx_update(float** %ptr) nounwind { +; CHECK: test_vld2_fx_update +; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16 + %A = load float** %ptr + %tmp0 = bitcast float* %A to i8* + %tmp1 = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 4) + %tmp2 = extractvalue { <2 x float>, <2 x float> } %tmp1, 0 + %tmp3 = getelementptr float* %A, i32 4 + store float* %tmp3, float** %ptr + ret <2 x float> %tmp2 +} + +define <16 x i8> @test_vld2_reg_update(i8** %ptr, i32 %inc) nounwind { +; CHECK: test_vld2_reg_update +; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %A = load i8** %ptr + %tmp0 = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %A, i32 1) + %tmp1 = extractvalue { <16 x i8>, <16 x i8> } %tmp0, 0 + %tmp2 = getelementptr i8* %A, i32 %inc + store i8* %tmp2, i8** %ptr + ret <16 x i8> %tmp1 +} + +define <4 x i32> @test_vld3_fx_update(i32** %ptr) nounwind { +; CHECK: test_vld3_fx_update +; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #48 + %A = load i32** %ptr + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 4) + %tmp2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %tmp1, 0 + %tmp3 = getelementptr i32* %A, i32 12 + store i32* %tmp3, i32** %ptr + ret <4 x i32> %tmp2 +} + +define <4 x i16> @test_vld3_reg_update(i16** %ptr, i32 %inc) nounwind { +; CHECK: test_vld3_reg_update +; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %A = load i16** %ptr + %tmp0 = bitcast i16* %A to 
i8* + %tmp1 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 2) + %tmp2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %tmp1, 0 + %tmp3 = getelementptr i16* %A, i32 %inc + store i16* %tmp3, i16** %ptr + ret <4 x i16> %tmp2 +} + +define <8 x i16> @test_vld4_fx_update(i16** %ptr) nounwind { +; CHECK: test_vld4_fx_update +; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], #64 + %A = load i16** %ptr + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8) + %tmp2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %tmp1, 0 + %tmp3 = getelementptr i16* %A, i32 32 + store i16* %tmp3, i16** %ptr + ret <8 x i16> %tmp2 +} + +define <8 x i8> @test_vld4_reg_update(i8** %ptr, i32 %inc) nounwind { +; CHECK: test_vld4_reg_update +; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %A = load i8** %ptr + %tmp0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %A, i32 1) + %tmp1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %tmp0, 0 + %tmp2 = getelementptr i8* %A, i32 %inc + store i8* %tmp2, i8** %ptr + ret <8 x i8> %tmp1 +} + +define void @test_vst1_fx_update(float** %ptr, <2 x float> %B) nounwind { +; CHECK: test_vst1_fx_update +; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #8 + %A = load float** %ptr + %tmp0 = bitcast float* %A to i8* + call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %B, i32 4) + %tmp2 = getelementptr float* %A, i32 2 + store float* %tmp2, float** %ptr + ret void +} + +define void @test_vst1_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind { +; CHECK: test_vst1_reg_update +; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}} + %A = load i16** %ptr + %tmp0 = bitcast i16* %A to i8* + call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %B, i32 2) + %tmp1 = getelementptr i16* %A, i32 %inc + store i16* %tmp1, i16** %ptr + ret void +} + +define void @test_vst2_fx_update(i64** %ptr, <1 x i64> %B) nounwind { +; CHECK: test_vst2_fx_update +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}], #16 + %A = load i64** %ptr + %tmp0 = bitcast i64* %A to i8* + call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %B, <1 x i64> %B, i32 8) + %tmp1 = getelementptr i64* %A, i32 2 + store i64* %tmp1, i64** %ptr + ret void +} + +define void @test_vst2_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind { +; CHECK: test_vst2_reg_update +; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}} + %A = load i8** %ptr + call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, i32 4) + %tmp0 = getelementptr i8* %A, i32 %inc + store i8* %tmp0, i8** %ptr + ret void +} + +define void @test_vst3_fx_update(i32** %ptr, <2 x i32> %B) nounwind { +; CHECK: test_vst3_fx_update +; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #24 + %A = load i32** %ptr + %tmp0 = bitcast i32* %A to i8* + call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %B, <2 x i32> %B, <2 x i32> %B, i32 4) + %tmp1 = getelementptr i32* %A, i32 6 + store i32* %tmp1, i32** %ptr + ret void +} + +define void @test_vst3_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind { +; CHECK: test_vst3_reg_update +; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}} + %A = load i16** %ptr + %tmp0 = bitcast i16* %A to i8* + call void 
@llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %B, <8 x i16> %B, <8 x i16> %B, i32 2) + %tmp1 = getelementptr i16* %A, i32 %inc + store i16* %tmp1, i16** %ptr + ret void +} + +define void @test_vst4_fx_update(float** %ptr, <4 x float> %B) nounwind { +; CHECK: test_vst4_fx_update +; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}], #64 + %A = load float** %ptr + %tmp0 = bitcast float* %A to i8* + call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %B, <4 x float> %B, <4 x float> %B, <4 x float> %B, i32 4) + %tmp1 = getelementptr float* %A, i32 16 + store float* %tmp1, float** %ptr + ret void +} + +define void @test_vst4_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind { +; CHECK: test_vst4_reg_update +; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}} + %A = load i8** %ptr + call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, i32 1) + %tmp0 = getelementptr i8* %A, i32 %inc + store i8* %tmp0, i8** %ptr + ret void +} + + +declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) +declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) +declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32) + +declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) +declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) +declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) +declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) +declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) +declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) +declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) +declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) + +define <16 x i8> @test_vld1x2_fx_update(i8* %a, i8** %ptr) { +; CHECK: test_vld1x2_fx_update +; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32 + %1 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1) + %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 + %tmp1 = getelementptr i8* %a, i32 32 + store i8* %tmp1, i8** %ptr + ret <16 x i8> %2 +} + +define <8 x i16> @test_vld1x2_reg_update(i16* %a, i16** %ptr, i32 %inc) { +; CHECK: test_vld1x2_reg_update +; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast i16* %a to i8* + %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2) + %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0 + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret <8 x i16> %3 +} + +define <2 x i64> @test_vld1x3_fx_update(i64* %a, i64** %ptr) { +; CHECK: test_vld1x3_fx_update +; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], #48 + %1 = bitcast i64* %a to i8* + %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8) + %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0 + %tmp1 = getelementptr 
i64* %a, i32 6 + store i64* %tmp1, i64** %ptr + ret <2 x i64> %3 +} + +define <8 x i16> @test_vld1x3_reg_update(i16* %a, i16** %ptr, i32 %inc) { +; CHECK: test_vld1x3_reg_update +; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast i16* %a to i8* + %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2) + %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0 + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret <8 x i16> %3 +} + +define <4 x float> @test_vld1x4_fx_update(float* %a, float** %ptr) { +; CHECK: test_vld1x4_fx_update +; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64 + %1 = bitcast float* %a to i8* + %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4) + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0 + %tmp1 = getelementptr float* %a, i32 16 + store float* %tmp1, float** %ptr + ret <4 x float> %3 +} + +define <8 x i8> @test_vld1x4_reg_update(i8* readonly %a, i8** %ptr, i32 %inc) #0 { +; CHECK: test_vld1x4_reg_update +; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1) + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 + %tmp1 = getelementptr i8* %a, i32 %inc + store i8* %tmp1, i8** %ptr + ret <8 x i8> %2 +} + +define void @test_vst1x2_fx_update(i8* %a, [2 x <16 x i8>] %b.coerce, i8** %ptr) #2 { +; CHECK: test_vst1x2_fx_update +; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32 + %1 = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %2 = extractvalue [2 x <16 x i8>] %b.coerce, 1 + tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1) + %tmp1 = getelementptr i8* %a, i32 32 + store i8* %tmp1, i8** %ptr + ret void +} + +define void @test_vst1x2_reg_update(i16* %a, [2 x <8 x i16>] %b.coerce, i16** %ptr, i32 %inc) #2 { +; CHECK: test_vst1x2_reg_update +; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [2 x <8 x i16>] %b.coerce, 0 + %2 = extractvalue [2 x <8 x i16>] %b.coerce, 1 + %3 = bitcast i16* %a to i8* + tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2) + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret void +} + +define void @test_vst1x3_fx_update(i32* %a, [3 x <2 x i32>] %b.coerce, i32** %ptr) #2 { +; CHECK: test_vst1x3_fx_update +; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #24 + %1 = extractvalue [3 x <2 x i32>] %b.coerce, 0 + %2 = extractvalue [3 x <2 x i32>] %b.coerce, 1 + %3 = extractvalue [3 x <2 x i32>] %b.coerce, 2 + %4 = bitcast i32* %a to i8* + tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4) + %tmp1 = getelementptr i32* %a, i32 6 + store i32* %tmp1, i32** %ptr + ret void +} + +define void @test_vst1x3_reg_update(i64* %a, [3 x <1 x i64>] %b.coerce, i64** %ptr, i32 %inc) #2 { +; CHECK: test_vst1x3_reg_update +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [3 x <1 x i64>] %b.coerce, 0 + %2 = extractvalue [3 x <1 x i64>] %b.coerce, 1 + %3 = extractvalue [3 x <1 x i64>] %b.coerce, 2 + %4 = bitcast i64* %a 
to i8* + tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8) + %tmp1 = getelementptr i64* %a, i32 %inc + store i64* %tmp1, i64** %ptr + ret void +} + +define void @test_vst1x4_fx_update(float* %a, [4 x <4 x float>] %b.coerce, float** %ptr) #2 { +; CHECK: test_vst1x4_fx_update +; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64 + %1 = extractvalue [4 x <4 x float>] %b.coerce, 0 + %2 = extractvalue [4 x <4 x float>] %b.coerce, 1 + %3 = extractvalue [4 x <4 x float>] %b.coerce, 2 + %4 = extractvalue [4 x <4 x float>] %b.coerce, 3 + %5 = bitcast float* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4) + %tmp1 = getelementptr float* %a, i32 16 + store float* %tmp1, float** %ptr + ret void +} + +define void @test_vst1x4_reg_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr, i32 %inc) #2 { +; CHECK: test_vst1x4_reg_update +; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [4 x <2 x double>] %b.coerce, 0 + %2 = extractvalue [4 x <2 x double>] %b.coerce, 1 + %3 = extractvalue [4 x <2 x double>] %b.coerce, 2 + %4 = extractvalue [4 x <2 x double>] %b.coerce, 3 + %5 = bitcast double* %a to i8* + tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8) + %tmp1 = getelementptr double* %a, i32 %inc + store double* %tmp1, double** %ptr + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32) +declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) +declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) +declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) +declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) +declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #3 +declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) #3 diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll new file mode 100644 index 0000000000000..80a934700c6be --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll @@ -0,0 +1,319 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) { +; CHECK-LABEL: test_vld2q_dup_fx_update +; CHECK: ld2r {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2 + %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 + %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 + %5 = 
shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer + %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0 + %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret { [2 x <16 x i8>] } %7 +} + +define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld2q_dup_reg_update +; CHECK: ld2r {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast i32* %a to i8* + %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1 + %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer + %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0 + %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1 + %tmp1 = getelementptr i32* %a, i32 %inc + store i32* %tmp1, i32** %ptr + ret { [2 x <4 x i32>] } %8 +} + +define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) { +; CHECK-LABEL: test_vld3_dup_fx_update +; CHECK: ld3r {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6 + %1 = bitcast i16* %a to i8* + %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0 + %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer + %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1 + %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer + %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 + %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer + %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0 + %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1 + %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 + %tmp1 = getelementptr i16* %a, i32 3 + store i16* %tmp1, i16** %ptr + ret { [3 x <4 x i16>] } %11 +} + +define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld3_dup_reg_update +; CHECK: ld3r {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0 + %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 + %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer + %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 + %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer + %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0 + %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1 + %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2 + %tmp1 = getelementptr i8* %a, i32 %inc + store i8* %tmp1, i8** %ptr + ret { [3 x <8 x i8>] }%10 +} + +define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 { +; CHECK-LABEL: test_vld4_dup_fx_update +; CHECK: ld4r {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16 + %1 = 
bitcast i32* %a to i8* + %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0 + %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer + %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 + %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer + %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 + %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer + %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3 + %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer + %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0 + %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1 + %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2 + %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 + %tmp1 = getelementptr i32* %a, i32 4 + store i32* %tmp1, i32** %ptr + ret { [4 x <2 x i32>] } %14 +} + +define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld4_dup_reg_update +; CHECK: ld4r {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast double* %a to i8* + %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0 + %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer + %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1 + %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer + %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2 + %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer + %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3 + %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer + %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0 + %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1 + %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2 + %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 + %tmp1 = getelementptr double* %a, i32 %inc + store double* %tmp1, double** %ptr + ret { [4 x <2 x double>] } %14 +} + +define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) { +; CHECK-LABEL: test_vld2_lane_fx_update +; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2 + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) + %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 + %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 + %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 + %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret { [2 x <8 x i8>] } %7 +} + +define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** 
%ptr, i32 %inc) { +; CHECK-LABEL: test_vld2_lane_reg_update +; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1) + %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 + %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 + %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 + %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 %inc + store i8* %tmp1, i8** %ptr + ret { [2 x <8 x i8>] } %7 +} + +define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) { +; CHECK-LABEL: test_vld3_lane_fx_update +; CHECK: ld3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12 + %1 = extractvalue [3 x <2 x float>] %b, 0 + %2 = extractvalue [3 x <2 x float>] %b, 1 + %3 = extractvalue [3 x <2 x float>] %b, 2 + %4 = bitcast float* %a to i8* + %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4) + %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0 + %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1 + %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2 + %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0 + %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1 + %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2 + %tmp1 = getelementptr float* %a, i32 3 + store float* %tmp1, float** %ptr + ret { [3 x <2 x float>] } %11 +} + +define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld3_lane_reg_update +; CHECK: ld3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [3 x <4 x i16>] %b, 0 + %2 = extractvalue [3 x <4 x i16>] %b, 1 + %3 = extractvalue [3 x <4 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) + %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0 + %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1 + %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2 + %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0 + %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1 + %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret { [3 x <4 x i16>] } %11 +} + +define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) { +; CHECK-LABEL: test_vld4_lane_fx_update +; CHECK: ld4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16 + %1 = extractvalue [4 x <2 x i32>] %b, 0 + %2 = extractvalue [4 x <2 x i32>] %b, 1 + %3 = extractvalue [4 x <2 x i32>] %b, 2 + %4 = extractvalue [4 x <2 x i32>] %b, 3 + %5 = bitcast i32* %a to i8* + %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4) + %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0 + %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1 + %9 = extractvalue 
{ <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2 + %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3 + %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0 + %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1 + %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2 + %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 + %tmp1 = getelementptr i32* %a, i32 4 + store i32* %tmp1, i32** %ptr + ret { [4 x <2 x i32>] } %14 +} + +define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld4_lane_reg_update +; CHECK: ld4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [4 x <2 x double>] %b, 0 + %2 = extractvalue [4 x <2 x double>] %b, 1 + %3 = extractvalue [4 x <2 x double>] %b, 2 + %4 = extractvalue [4 x <2 x double>] %b, 3 + %5 = bitcast double* %a to i8* + %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) + %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0 + %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1 + %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2 + %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3 + %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0 + %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1 + %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2 + %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 + %tmp1 = getelementptr double* %a, i32 %inc + store double* %tmp1, double** %ptr + ret { [4 x <2 x double>] } %14 +} + +define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) { +; CHECK-LABEL: test_vst2_lane_fx_update +; CHECK: st2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2 + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret void +} + +define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst2_lane_reg_update +; CHECK: st2 {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1 + %3 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4) + %tmp1 = getelementptr i32* %a, i32 %inc + store i32* %tmp1, i32** %ptr + ret void +} + +define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) { +; CHECK-LABEL: test_vst3_lane_fx_update +; CHECK: st3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12 + %1 = extractvalue [3 x <4 x float>] %b, 0 + %2 = extractvalue [3 x <4 x float>] %b, 1 + %3 = extractvalue [3 x <4 x float>] %b, 2 + %4 = bitcast float* %a to i8* + call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4) + %tmp1 = getelementptr float* %a, i32 3 + store float* %tmp1, float** %ptr + ret void +} + +; Function Attrs: nounwind +define void 
@test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst3_lane_reg_update +; CHECK: st3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [3 x <4 x i16>] %b, 0 + %2 = extractvalue [3 x <4 x i16>] %b, 1 + %3 = extractvalue [3 x <4 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret void +} + +define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) { +; CHECK-LABEL: test_vst4_lane_fx_update +; CHECK: st4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32 + %1 = extractvalue [4 x <2 x double>] %b.coerce, 0 + %2 = extractvalue [4 x <2 x double>] %b.coerce, 1 + %3 = extractvalue [4 x <2 x double>] %b.coerce, 2 + %4 = extractvalue [4 x <2 x double>] %b.coerce, 3 + %5 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) + %tmp1 = getelementptr double* %a, i32 4 + store double* %tmp1, double** %ptr + ret void +} + + +define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst4_lane_reg_update +; CHECK: st4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [4 x <2 x float>] %b.coerce, 0 + %2 = extractvalue [4 x <2 x float>] %b.coerce, 1 + %3 = extractvalue [4 x <2 x float>] %b.coerce, 2 + %4 = extractvalue [4 x <2 x float>] %b.coerce, 3 + %5 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4) + %tmp1 = getelementptr float* %a, i32 %inc + store float* %tmp1, float** %ptr + ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x 
double>, i32, i32) diff --git a/test/CodeGen/AArch64/neon-simd-shift.ll b/test/CodeGen/AArch64/neon-simd-shift.ll new file mode 100644 index 0000000000000..fd762656e56ed --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-shift.ll @@ -0,0 +1,1556 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) { +; CHECK: test_vshr_n_s8 +; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vshr_n = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <8 x i8> %vshr_n +} + +define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) { +; CHECK: test_vshr_n_s16 +; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vshr_n = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3> + ret <4 x i16> %vshr_n +} + +define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) { +; CHECK: test_vshr_n_s32 +; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vshr_n = ashr <2 x i32> %a, <i32 3, i32 3> + ret <2 x i32> %vshr_n +} + +define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) { +; CHECK: test_vshrq_n_s8 +; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vshr_n = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <16 x i8> %vshr_n +} + +define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) { +; CHECK: test_vshrq_n_s16 +; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vshr_n = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %vshr_n +} + +define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) { +; CHECK: test_vshrq_n_s32 +; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vshr_n = ashr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %vshr_n +} + +define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) { +; CHECK: test_vshrq_n_s64 +; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vshr_n = ashr <2 x i64> %a, <i64 3, i64 3> + ret <2 x i64> %vshr_n +} + +define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) { +; CHECK: test_vshr_n_u8 +; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vshr_n = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <8 x i8> %vshr_n +} + +define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) { +; CHECK: test_vshr_n_u16 +; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vshr_n = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3> + ret <4 x i16> %vshr_n +} + +define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) { +; CHECK: test_vshr_n_u32 +; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vshr_n = lshr <2 x i32> %a, <i32 3, i32 3> + ret <2 x i32> %vshr_n +} + +define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) { +; CHECK: test_vshrq_n_u8 +; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vshr_n = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <16 x i8> %vshr_n +} + +define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) { +; CHECK: test_vshrq_n_u16 +; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vshr_n = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %vshr_n +} + +define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) { +; CHECK: test_vshrq_n_u32 +; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vshr_n = lshr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %vshr_n +} + +define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) { +; CHECK: test_vshrq_n_u64 +; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vshr_n = lshr <2 x i64> %a, <i64 3, i64 3> + ret <2 x i64> %vshr_n +} + 
+define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsra_n_s8 +; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsra_n = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + %1 = add <8 x i8> %vsra_n, %a + ret <8 x i8> %1 +} + +define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsra_n_s16 +; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vsra_n = ashr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3> + %1 = add <4 x i16> %vsra_n, %a + ret <4 x i16> %1 +} + +define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsra_n_s32 +; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vsra_n = ashr <2 x i32> %b, <i32 3, i32 3> + %1 = add <2 x i32> %vsra_n, %a + ret <2 x i32> %1 +} + +define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsraq_n_s8 +; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsra_n = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + %1 = add <16 x i8> %vsra_n, %a + ret <16 x i8> %1 +} + +define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsraq_n_s16 +; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vsra_n = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %1 = add <8 x i16> %vsra_n, %a + ret <8 x i16> %1 +} + +define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsraq_n_s32 +; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vsra_n = ashr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3> + %1 = add <4 x i32> %vsra_n, %a + ret <4 x i32> %1 +} + +define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsraq_n_s64 +; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vsra_n = ashr <2 x i64> %b, <i64 3, i64 3> + %1 = add <2 x i64> %vsra_n, %a + ret <2 x i64> %1 +} + +define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsra_n_u8 +; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsra_n = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + %1 = add <8 x i8> %vsra_n, %a + ret <8 x i8> %1 +} + +define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsra_n_u16 +; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vsra_n = lshr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3> + %1 = add <4 x i16> %vsra_n, %a + ret <4 x i16> %1 +} + +define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsra_n_u32 +; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vsra_n = lshr <2 x i32> %b, <i32 3, i32 3> + %1 = add <2 x i32> %vsra_n, %a + ret <2 x i32> %1 +} + +define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsraq_n_u8 +; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsra_n = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + %1 = add <16 x i8> %vsra_n, %a + ret <16 x i8> %1 +} + +define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsraq_n_u16 +; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vsra_n = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %1 = add <8 x i16> %vsra_n, %a + ret <8 x i16> %1 +} + +define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsraq_n_u32 +; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vsra_n = lshr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3> + %1 = add <4 x i32> %vsra_n, %a + ret <4 x i32> %1 +} + +define <2 x 
i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsraq_n_u64 +; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vsra_n = lshr <2 x i64> %b, <i64 3, i64 3> + %1 = add <2 x i64> %vsra_n, %a + ret <2 x i64> %1 +} + +define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) { +; CHECK: test_vrshr_n_s8 +; CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %a, i32 3) + ret <8 x i8> %vrshr_n +} + + +define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) { +; CHECK: test_vrshr_n_s16 +; CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %a, i32 3) + ret <4 x i16> %vrshr_n +} + + +define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) { +; CHECK: test_vrshr_n_s32 +; CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %a, i32 3) + ret <2 x i32> %vrshr_n +} + + +define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) { +; CHECK: test_vrshrq_n_s8 +; CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %a, i32 3) + ret <16 x i8> %vrshr_n +} + + +define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) { +; CHECK: test_vrshrq_n_s16 +; CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %a, i32 3) + ret <8 x i16> %vrshr_n +} + + +define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) { +; CHECK: test_vrshrq_n_s32 +; CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %a, i32 3) + ret <4 x i32> %vrshr_n +} + + +define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) { +; CHECK: test_vrshrq_n_s64 +; CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %a, i32 3) + ret <2 x i64> %vrshr_n +} + + +define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) { +; CHECK: test_vrshr_n_u8 +; CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %a, i32 3) + ret <8 x i8> %vrshr_n +} + + +define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) { +; CHECK: test_vrshr_n_u16 +; CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %a, i32 3) + ret <4 x i16> %vrshr_n +} + + +define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) { +; CHECK: test_vrshr_n_u32 +; CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %a, i32 3) + ret <2 x i32> %vrshr_n +} + + +define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) { +; CHECK: test_vrshrq_n_u8 +; CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %a, i32 3) + ret <16 x i8> %vrshr_n +} + + +define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) { +; CHECK: test_vrshrq_n_u16 +; CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %a, i32 3) + ret <8 x i16> %vrshr_n +} + + +define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) { +; CHECK: test_vrshrq_n_u32 +; CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %a, i32 3) + ret <4 x i32> %vrshr_n +} + + +define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) { +; CHECK: test_vrshrq_n_u64 +; CHECK: urshr 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %a, i32 3) + ret <2 x i64> %vrshr_n +} + + +define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vrsra_n_s8 +; CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %1 = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %b, i32 3) + %vrsra_n = add <8 x i8> %1, %a + ret <8 x i8> %vrsra_n +} + +define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vrsra_n_s16 +; CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %1 = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %b, i32 3) + %vrsra_n = add <4 x i16> %1, %a + ret <4 x i16> %vrsra_n +} + +define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vrsra_n_s32 +; CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %1 = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %b, i32 3) + %vrsra_n = add <2 x i32> %1, %a + ret <2 x i32> %vrsra_n +} + +define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vrsraq_n_s8 +; CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %1 = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %b, i32 3) + %vrsra_n = add <16 x i8> %1, %a + ret <16 x i8> %vrsra_n +} + +define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vrsraq_n_s16 +; CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %1 = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %b, i32 3) + %vrsra_n = add <8 x i16> %1, %a + ret <8 x i16> %vrsra_n +} + +define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vrsraq_n_s32 +; CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %1 = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %b, i32 3) + %vrsra_n = add <4 x i32> %1, %a + ret <4 x i32> %vrsra_n +} + +define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vrsraq_n_s64 +; CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %1 = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %b, i32 3) + %vrsra_n = add <2 x i64> %1, %a + ret <2 x i64> %vrsra_n +} + +define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vrsra_n_u8 +; CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %1 = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %b, i32 3) + %vrsra_n = add <8 x i8> %1, %a + ret <8 x i8> %vrsra_n +} + +define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vrsra_n_u16 +; CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %1 = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %b, i32 3) + %vrsra_n = add <4 x i16> %1, %a + ret <4 x i16> %vrsra_n +} + +define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vrsra_n_u32 +; CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %1 = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %b, i32 3) + %vrsra_n = add <2 x i32> %1, %a + ret <2 x i32> %vrsra_n +} + +define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vrsraq_n_u8 +; CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %1 = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %b, i32 3) + %vrsra_n = add <16 x i8> %1, %a + ret <16 x i8> %vrsra_n +} + +define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vrsraq_n_u16 +; CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %1 = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %b, i32 3) + %vrsra_n = add 
<8 x i16> %1, %a + ret <8 x i16> %vrsra_n +} + +define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vrsraq_n_u32 +; CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %1 = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %b, i32 3) + %vrsra_n = add <4 x i32> %1, %a + ret <4 x i32> %vrsra_n +} + +define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vrsraq_n_u64 +; CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %1 = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %b, i32 3) + %vrsra_n = add <2 x i64> %1, %a + ret <2 x i64> %vrsra_n +} + +define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsri_n_s8 +; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) + ret <8 x i8> %vsri_n +} + + +define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsri_n_s16 +; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3) + ret <4 x i16> %vsri +} + + +define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsri_n_s32 +; CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vsri = tail call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3) + ret <2 x i32> %vsri +} + + +define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsriq_n_s8 +; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) + ret <16 x i8> %vsri_n +} + + +define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsriq_n_s16 +; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3) + ret <8 x i16> %vsri +} + + +define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsriq_n_s32 +; CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vsri = tail call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3) + ret <4 x i32> %vsri +} + + +define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsriq_n_s64 +; CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vsri = tail call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3) + ret <2 x i64> %vsri +} + +define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsri_n_p8 +; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) + ret <8 x i8> %vsri_n +} + +define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsri_n_p16 +; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 + %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15) + ret <4 x i16> %vsri +} + +define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsriq_n_p8 +; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) + ret <16 x i8> %vsri_n +} + +define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsriq_n_p16 +; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 + %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15) + ret <8 x i16> %vsri +} + +define <8 x i8> 
@test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsli_n_s8 +; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) + ret <8 x i8> %vsli_n +} + +define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsli_n_s16 +; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3) + ret <4 x i16> %vsli +} + +define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsli_n_s32 +; CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vsli = tail call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3) + ret <2 x i32> %vsli +} + +define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsliq_n_s8 +; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) + ret <16 x i8> %vsli_n +} + +define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsliq_n_s16 +; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3) + ret <8 x i16> %vsli +} + +define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsliq_n_s32 +; CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vsli = tail call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3) + ret <4 x i32> %vsli +} + +define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsliq_n_s64 +; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vsli = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3) + ret <2 x i64> %vsli +} + +define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsli_n_p8 +; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) + ret <8 x i8> %vsli_n +} + +define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsli_n_p16 +; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15 + %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15) + ret <4 x i16> %vsli +} + +define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsliq_n_p8 +; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) + ret <16 x i8> %vsli_n +} + +define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsliq_n_p16 +; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15 + %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15) + ret <8 x i16> %vsli +} + +define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) { +; CHECK: test_vqshl_n_s8 +; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vqshl = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) + ret <8 x i8> %vqshl +} + + +define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) { +; CHECK: test_vqshl_n_s16 +; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>) + ret <4 x i16> %vqshl +} + + +define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) { +; CHECK: test_vqshl_n_s32 +; CHECK: sqshl {{v[0-9]+}}.2s, 
{{v[0-9]+}}.2s, #3 + %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>) + ret <2 x i32> %vqshl +} + + +define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) { +; CHECK: test_vqshlq_n_s8 +; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) + ret <16 x i8> %vqshl_n +} + + +define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) { +; CHECK: test_vqshlq_n_s16 +; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <8 x i16> %vqshl +} + + +define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) { +; CHECK: test_vqshlq_n_s32 +; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) + ret <4 x i32> %vqshl +} + + +define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) { +; CHECK: test_vqshlq_n_s64 +; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>) + ret <2 x i64> %vqshl +} + + +define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) { +; CHECK: test_vqshl_n_u8 +; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) + ret <8 x i8> %vqshl_n +} + + +define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) { +; CHECK: test_vqshl_n_u16 +; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>) + ret <4 x i16> %vqshl +} + + +define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) { +; CHECK: test_vqshl_n_u32 +; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>) + ret <2 x i32> %vqshl +} + + +define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) { +; CHECK: test_vqshlq_n_u8 +; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>) + ret <16 x i8> %vqshl_n +} + + +define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) { +; CHECK: test_vqshlq_n_u16 +; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>) + ret <8 x i16> %vqshl +} + + +define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) { +; CHECK: test_vqshlq_n_u32 +; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>) + ret <4 x i32> %vqshl +} + + +define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) { +; CHECK: test_vqshlq_n_u64 +; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>) + ret <2 x i64> %vqshl +} + +define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) { +; CHECK: test_vqshlu_n_s8 +; CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vqshlu = tail call <8 x i8> 
@llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8> %a, i32 3) + ret <8 x i8> %vqshlu +} + + +define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) { +; CHECK: test_vqshlu_n_s16 +; CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vqshlu = tail call <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16> %a, i32 3) + ret <4 x i16> %vqshlu +} + + +define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) { +; CHECK: test_vqshlu_n_s32 +; CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vqshlu = tail call <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32> %a, i32 3) + ret <2 x i32> %vqshlu +} + + +define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) { +; CHECK: test_vqshluq_n_s8 +; CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vqshlu = tail call <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8> %a, i32 3) + ret <16 x i8> %vqshlu +} + + +define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) { +; CHECK: test_vqshluq_n_s16 +; CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vqshlu = tail call <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16> %a, i32 3) + ret <8 x i16> %vqshlu +} + + +define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) { +; CHECK: test_vqshluq_n_s32 +; CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vqshlu = tail call <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32> %a, i32 3) + ret <4 x i32> %vqshlu +} + + +define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) { +; CHECK: test_vqshluq_n_s64 +; CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vqshlu = tail call <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64> %a, i32 3) + ret <2 x i64> %vqshlu +} + + +define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) { +; CHECK: test_vshrn_n_s16 +; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %1 = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + ret <8 x i8> %vshrn_n +} + +define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) { +; CHECK: test_vshrn_n_s32 +; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %1 = ashr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9> + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + ret <4 x i16> %vshrn_n +} + +define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) { +; CHECK: test_vshrn_n_s64 +; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %1 = ashr <2 x i64> %a, <i64 19, i64 19> + %vshrn_n = trunc <2 x i64> %1 to <2 x i32> + ret <2 x i32> %vshrn_n +} + +define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) { +; CHECK: test_vshrn_n_u16 +; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %1 = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + ret <8 x i8> %vshrn_n +} + +define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) { +; CHECK: test_vshrn_n_u32 +; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %1 = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9> + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + ret <4 x i16> %vshrn_n +} + +define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) { +; CHECK: test_vshrn_n_u64 +; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %1 = lshr <2 x i64> %a, <i64 19, i64 19> + %vshrn_n = trunc <2 x i64> %1 to <2 x i32> + ret <2 x i32> %vshrn_n +} + +define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vshrn_high_n_s16 +; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %1 = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + %2 = bitcast <8 x i8> %a to <1 x i64> + %3 = bitcast <8 x i8> %vshrn_n to <1 x i64> + %shuffle.i = 
shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1> + %4 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %4 +} + +define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vshrn_high_n_s32 +; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %1 = ashr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9> + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + %2 = bitcast <4 x i16> %a to <1 x i64> + %3 = bitcast <4 x i16> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1> + %4 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %4 +} + +define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vshrn_high_n_s64 +; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %2 = ashr <2 x i64> %b, <i64 19, i64 19> + %vshrn_n = trunc <2 x i64> %2 to <2 x i32> + %3 = bitcast <2 x i32> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1> + %4 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %4 +} + +define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vshrn_high_n_u16 +; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %1 = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + %2 = bitcast <8 x i8> %a to <1 x i64> + %3 = bitcast <8 x i8> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1> + %4 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %4 +} + +define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vshrn_high_n_u32 +; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %1 = lshr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9> + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + %2 = bitcast <4 x i16> %a to <1 x i64> + %3 = bitcast <4 x i16> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1> + %4 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %4 +} + +define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vshrn_high_n_u64 +; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %2 = lshr <2 x i64> %b, <i64 19, i64 19> + %vshrn_n = trunc <2 x i64> %2 to <2 x i32> + %3 = bitcast <2 x i32> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1> + %4 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %4 +} + +define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) { +; CHECK: test_vqshrun_n_s16 +; CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %a, i32 3) + ret <8 x i8> %vqshrun +} + + +define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) { +; CHECK: test_vqshrun_n_s32 +; CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %a, i32 9) + ret <4 x i16> %vqshrun +} + +define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) { +; CHECK: test_vqshrun_n_s64 +; CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %a, i32 19) + ret <2 x i32> %vqshrun +} + +define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqshrun_high_n_s16 +; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + 
%vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqshrun_high_n_s32 +; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqshrun_high_n_s64 +; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) { +; CHECK: test_vrshrn_n_s16 +; CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %a, i32 3) + ret <8 x i8> %vrshrn +} + + +define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) { +; CHECK: test_vrshrn_n_s32 +; CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %a, i32 9) + ret <4 x i16> %vrshrn +} + + +define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) { +; CHECK: test_vrshrn_n_s64 +; CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %a, i32 19) + ret <2 x i32> %vrshrn +} + +define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vrshrn_high_n_s16 +; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vrshrn_high_n_s32 +; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vrshrn_high_n_s64 +; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) { 
+; CHECK: test_vqrshrun_n_s16 +; CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %a, i32 3) + ret <8 x i8> %vqrshrun +} + +define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) { +; CHECK: test_vqrshrun_n_s32 +; CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %a, i32 9) + ret <4 x i16> %vqrshrun +} + +define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) { +; CHECK: test_vqrshrun_n_s64 +; CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %a, i32 19) + ret <2 x i32> %vqrshrun +} + +define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqrshrun_high_n_s16 +; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqrshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqrshrun_high_n_s32 +; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqrshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqrshrun_high_n_s64 +; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqrshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) { +; CHECK: test_vqshrn_n_s16 +; CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %a, i32 3) + ret <8 x i8> %vqshrn +} + + +define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) { +; CHECK: test_vqshrn_n_s32 +; CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %a, i32 9) + ret <4 x i16> %vqshrn +} + + +define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) { +; CHECK: test_vqshrn_n_s64 +; CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %a, i32 19) + ret <2 x i32> %vqshrn +} + + +define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) { +; CHECK: test_vqshrn_n_u16 +; CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %a, i32 3) + ret <8 x i8> %vqshrn +} + + +define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) { +; CHECK: test_vqshrn_n_u32 +; CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %a, i32 9) + ret <4 x i16> %vqshrn +} + + +define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) { +; CHECK: test_vqshrn_n_u64 +; CHECK: 
uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %a, i32 19) + ret <2 x i32> %vqshrn +} + + +define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqshrn_high_n_s16 +; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqshrn_high_n_s32 +; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqshrn_high_n_s64 +; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqshrn_high_n_u16 +; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqshrn_high_n_u32 +; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqshrn_high_n_u64 +; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) { +; CHECK: test_vqrshrn_n_s16 +; CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %a, i32 3) + ret <8 x i8> %vqrshrn +} + + +define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) { +; CHECK: test_vqrshrn_n_s32 +; CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %a, i32 9) + ret 
<4 x i16> %vqrshrn +} + + +define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) { +; CHECK: test_vqrshrn_n_s64 +; CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %a, i32 19) + ret <2 x i32> %vqrshrn +} + + +define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) { +; CHECK: test_vqrshrn_n_u16 +; CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %a, i32 3) + ret <8 x i8> %vqrshrn +} + + +define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) { +; CHECK: test_vqrshrn_n_u32 +; CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %a, i32 9) + ret <4 x i16> %vqrshrn +} + + +define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) { +; CHECK: test_vqrshrn_n_u64 +; CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %a, i32 19) + ret <2 x i32> %vqrshrn +} + + +define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqrshrn_high_n_s16 +; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqrshrn_high_n_s32 +; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqrshrn_high_n_s64 +; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqrshrn_high_n_u16 +; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqrshrn_high_n_u32 +; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqrshrn_high_n_u64(<2 x 
i32> %a, <2 x i64> %b) { +; CHECK: test_vqrshrn_high_n_u64 +; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) { +; CHECK: test_vcvt_n_f32_s32 +; CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 + %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 31) + ret <2 x float> %vcvt +} + +define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) { +; CHECK: test_vcvtq_n_f32_s32 +; CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 + %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 31) + ret <4 x float> %vcvt +} + +define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) { +; CHECK: test_vcvtq_n_f64_s64 +; CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 + %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 50) + ret <2 x double> %vcvt +} + +define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) { +; CHECK: test_vcvt_n_f32_u32 +; CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 + %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 31) + ret <2 x float> %vcvt +} + +define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) { +; CHECK: test_vcvtq_n_f32_u32 +; CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 + %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 31) + ret <4 x float> %vcvt +} + +define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) { +; CHECK: test_vcvtq_n_f64_u64 +; CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 + %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 50) + ret <2 x double> %vcvt +} + +define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) { +; CHECK: test_vcvt_n_s32_f32 +; CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 + %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a, i32 31) + ret <2 x i32> %vcvt +} + +define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) { +; CHECK: test_vcvtq_n_s32_f32 +; CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 + %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %a, i32 31) + ret <4 x i32> %vcvt +} + +define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) { +; CHECK: test_vcvtq_n_s64_f64 +; CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 + %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %a, i32 50) + ret <2 x i64> %vcvt +} + +define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) { +; CHECK: test_vcvt_n_u32_f32 +; CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31 + %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %a, i32 31) + ret <2 x i32> %vcvt +} + +define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) { +; CHECK: test_vcvtq_n_u32_f32 +; CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31 + %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %a, i32 31) + ret <4 x i32> %vcvt +} + +define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) { +; CHECK: test_vcvtq_n_u64_f64 +; CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50 + %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %a, i32 
50) + ret <2 x i64> %vcvt +} + +declare <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32>, i32) + +declare <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8>, i32) + +declare <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16>, i32) + +declare <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32>, i32) + +declare <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32>, i32) + +declare <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8>, i32) + +declare <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16>, i32) + +declare <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32>, i32) + +declare <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8>, <8 x i8>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16>, <4 x i16>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32>, <2 x i32>, i32) + +declare <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8>, <16 x i8>, i32) + +declare <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16>, <8 x i16>, i32) + +declare <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32>, <4 x i32>, i32) + +declare <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64>, <2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) + +declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) + +declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) + +declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) + +declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32>, i32) + +declare <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8>, i32) + +declare <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16>, i32) + +declare <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32>, i32) + +declare <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64>, i32) + +declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) + +declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) + +declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) + +declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) + +declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) + +declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) + +declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) + +declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) + +declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) + +declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 
x i64>, <2 x i64>) + +declare <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64>, i32) + +declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) + +declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) + +declare <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) + +declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) + +declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) + +declare <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) + +declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) + +declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) + +declare <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) + +declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) + +declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) + +declare <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) + +define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvt_n_s64_f64 +; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvt_n_u64_f64 +; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64) + ret <1 x i64> %1 +} + +define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) { +; CHECK-LABEL: test_vcvt_n_f64_s64 +; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64) + ret <1 x double> %1 +} + +define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) { +; CHECK-LABEL: test_vcvt_n_f64_u64 +; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64) + ret <1 x double> %1 +} + +declare <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32) +declare <1 x i64> 
@llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32) +declare <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32) +declare <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll new file mode 100644 index 0000000000000..8eac1e88c4a52 --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-tbl.ll @@ -0,0 +1,828 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) + +declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) + +declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) + +declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8>, <16 x i8>, <8 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8>, <16 x i8>, <8 x i8>) + +declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8>, <8 x i8>) + +declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) + +declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) + +declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) + +declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8>, <16 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) + +declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) + +define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtbl1_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl11.i +} + +define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) { +; CHECK: test_vqtbl1_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b) + ret <8 x i8> %vtbl1.i +} + +define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl2_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 + %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl17.i +} + +define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) 
{ +; CHECK: test_vqtbl2_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl2.i +} + +define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl3_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) + ret <8 x i8> %vtbl212.i +} + +define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl3_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl3.i +} + +define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl4_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) + ret <8 x i8> %vtbl216.i +} + +define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl4_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + 
%__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl4.i +} + +define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vqtbl1q_s8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %vtbl1.i +} + +define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl2q_s8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl2.i +} + +define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl3q_s8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl3.i +} + +define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl4q_s8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl4.i +} + +define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vtbx1_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] 
%b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx2_s8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 + %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) + ret <8 x i8> %vtbx17.i +} + +define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx3_s8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx4_s8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 + %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) + ret <8 x i8> %vtbx216.i +} + +define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { +; CHECK: test_vqtbx1_s8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) + ret <8 x i8> %vtbx1.i +} + +define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx2_s8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + 
%__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 + %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx2.i +} + +define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx3_s8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 + %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx3.i +} + +define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx4_s8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx4.i +} + +define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vqtbx1q_s8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + ret <16 x i8> %vtbx1.i +} + +define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx2q_s8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 + %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx2.i +} + +define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx3q_s8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 + %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx3.i +} + +define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx4q_s8: +; CHECK: tbx 
{{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx4.i +} + +define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtbl1_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl11.i +} + +define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) { +; CHECK: test_vqtbl1_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b) + ret <8 x i8> %vtbl1.i +} + +define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl2_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 + %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl17.i +} + +define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl2_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl2.i +} + +define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl3_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call 
<8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) + ret <8 x i8> %vtbl212.i +} + +define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl3_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl3.i +} + +define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl4_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) + ret <8 x i8> %vtbl216.i +} + +define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl4_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl4.i +} + +define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vqtbl1q_u8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %vtbl1.i +} + +define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl2q_u8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl2.i +} + +define <16 x i8> @test_vqtbl3q_u8([3 x <16 x 
i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl3q_u8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl3.i +} + +define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl4q_u8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl4.i +} + +define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vtbx1_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx2_u8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 + %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) + ret <8 x i8> %vtbx17.i +} + +define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx3_u8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, 
<8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx4_u8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 + %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) + ret <8 x i8> %vtbx216.i +} + +define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { +; CHECK: test_vqtbx1_u8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) + ret <8 x i8> %vtbx1.i +} + +define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx2_u8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 + %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx2.i +} + +define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx3_u8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 + %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx3.i +} + +define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx4_u8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = 
extractvalue [4 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx4.i +} + +define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vqtbx1q_u8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + ret <16 x i8> %vtbx1.i +} + +define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx2q_u8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 + %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx2.i +} + +define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx3q_u8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 + %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx3.i +} + +define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx4q_u8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx4.i +} + +define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vtbl1_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl11.i +} + +define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) { +; CHECK: test_vqtbl1_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = tail call <8 x i8> 
@llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b) + ret <8 x i8> %vtbl1.i +} + +define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl2_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1 + %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b) + ret <8 x i8> %vtbl17.i +} + +define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl2_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl2.i +} + +define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl3_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b) + ret <8 x i8> %vtbl212.i +} + +define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl3_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl3.i +} + +define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vtbl4_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3 + %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, 
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b) + ret <8 x i8> %vtbl216.i +} + +define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) { +; CHECK: test_vqtbl4_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b) + ret <8 x i8> %vtbl4.i +} + +define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vqtbl1q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %vtbl1.i +} + +define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl2q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1 + %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl2.i +} + +define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl3q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2 + %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b) + ret <16 x i8> %vtbl3.i +} + +define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) { +; CHECK: test_vqtbl4q_p8: +; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0 + %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1 + %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2 + %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3 + %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, 
<16 x i8> %b) + ret <16 x i8> %vtbl4.i +} + +define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK: test_vtbx1_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx2_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1 + %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c) + ret <8 x i8> %vtbx17.i +} + +define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx3_p8: +; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2 + %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c) + %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24> + %1 = sext <8 x i1> %0 to <8 x i8> + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i) + ret <8 x i8> %vbsl.i +} + +define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vtbx4_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3 + %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x 
i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c) + ret <8 x i8> %vtbx216.i +} + +define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) { +; CHECK: test_vqtbx1_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) + ret <8 x i8> %vtbx1.i +} + +define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx2_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 + %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx2.i +} + +define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx3_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 + %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx3.i +} + +define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) { +; CHECK: test_vqtbx4_p8: +; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c) + ret <8 x i8> %vtbx4.i +} + +define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK: test_vqtbx1q_p8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + ret <16 x i8> %vtbx1.i +} + +define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx2q_p8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1 + %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx2.i +} + +define <16 x i8> 
@test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx3q_p8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2 + %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx3.i +} + +define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) { +; CHECK: test_vqtbx4q_p8: +; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b +entry: + %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3 + %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c) + ret <16 x i8> %vtbx4.i +} + diff --git a/test/CodeGen/AArch64/neon-simd-vget.ll b/test/CodeGen/AArch64/neon-simd-vget.ll new file mode 100644 index 0000000000000..6474499e4ff1d --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-vget.ll @@ -0,0 +1,225 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @test_vget_high_s8(<16 x i8> %a) { +; CHECK-LABEL: test_vget_high_s8: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vget_high_s16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_high_s16: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + ret <4 x i16> %shuffle.i +} + +define <2 x i32> @test_vget_high_s32(<4 x i32> %a) { +; CHECK-LABEL: test_vget_high_s32: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + ret <2 x i32> %shuffle.i +} + +define <1 x i64> @test_vget_high_s64(<2 x i64> %a) { +; CHECK-LABEL: test_vget_high_s64: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> + ret <1 x i64> %shuffle.i +} + +define <8 x i8> @test_vget_high_u8(<16 x i8> %a) { +; CHECK-LABEL: test_vget_high_u8: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vget_high_u16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_high_u16: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + ret <4 x i16> %shuffle.i +} + +define <2 x i32> @test_vget_high_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vget_high_u32: +; CHECK: dup d0, 
{{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + ret <2 x i32> %shuffle.i +} + +define <1 x i64> @test_vget_high_u64(<2 x i64> %a) { +; CHECK-LABEL: test_vget_high_u64: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> + ret <1 x i64> %shuffle.i +} + +define <1 x i64> @test_vget_high_p64(<2 x i64> %a) { +; CHECK-LABEL: test_vget_high_p64: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> + ret <1 x i64> %shuffle.i +} + +define <4 x i16> @test_vget_high_f16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_high_f16: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + ret <4 x i16> %shuffle.i +} + +define <2 x float> @test_vget_high_f32(<4 x float> %a) { +; CHECK-LABEL: test_vget_high_f32: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3> + ret <2 x float> %shuffle.i +} + +define <8 x i8> @test_vget_high_p8(<16 x i8> %a) { +; CHECK-LABEL: test_vget_high_p8: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vget_high_p16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_high_p16: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + ret <4 x i16> %shuffle.i +} + +define <1 x double> @test_vget_high_f64(<2 x double> %a) { +; CHECK-LABEL: test_vget_high_f64: +; CHECK: dup d0, {{v[0-9]+}}.d[1] +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> <i32 1> + ret <1 x double> %shuffle.i +} + +define <8 x i8> @test_vget_low_s8(<16 x i8> %a) { +; CHECK-LABEL: test_vget_low_s8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vget_low_s16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_low_s16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i16> %shuffle.i +} + +define <2 x i32> @test_vget_low_s32(<4 x i32> %a) { +; CHECK-LABEL: test_vget_low_s32: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + ret <2 x i32> %shuffle.i +} + +define <1 x i64> @test_vget_low_s64(<2 x i64> %a) { +; CHECK-LABEL: test_vget_low_s64: +; CHECK: ret +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer + ret <1 x i64> %shuffle.i +} + +define <8 x i8> @test_vget_low_u8(<16 x i8> %a) { +; CHECK-LABEL: test_vget_low_u8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vget_low_u16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_low_u16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i16> %shuffle.i +} + +define <2 x i32> @test_vget_low_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vget_low_u32: +; CHECK: ret +entry: + 
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + ret <2 x i32> %shuffle.i +} + +define <1 x i64> @test_vget_low_u64(<2 x i64> %a) { +; CHECK-LABEL: test_vget_low_u64: +; CHECK: ret +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer + ret <1 x i64> %shuffle.i +} + +define <1 x i64> @test_vget_low_p64(<2 x i64> %a) { +; CHECK-LABEL: test_vget_low_p64: +; CHECK: ret +entry: + %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer + ret <1 x i64> %shuffle.i +} + +define <4 x i16> @test_vget_low_f16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_low_f16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i16> %shuffle.i +} + +define <2 x float> @test_vget_low_f32(<4 x float> %a) { +; CHECK-LABEL: test_vget_low_f32: +; CHECK: ret +entry: + %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1> + ret <2 x float> %shuffle.i +} + +define <8 x i8> @test_vget_low_p8(<16 x i8> %a) { +; CHECK-LABEL: test_vget_low_p8: +; CHECK: ret +entry: + %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i8> %shuffle.i +} + +define <4 x i16> @test_vget_low_p16(<8 x i16> %a) { +; CHECK-LABEL: test_vget_low_p16: +; CHECK: ret +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x i16> %shuffle.i +} + +define <1 x double> @test_vget_low_f64(<2 x double> %a) { +; CHECK-LABEL: test_vget_low_f64: +; CHECK: ret +entry: + %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> zeroinitializer + ret <1 x double> %shuffle.i +} diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll index 77bf691cbcbd5..6ec4b19a12042 100644 --- a/test/CodeGen/AArch64/pic-eh-stubs.ll +++ b/test/CodeGen/AArch64/pic-eh-stubs.ll @@ -57,4 +57,4 @@ declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone declare i8* @__cxa_begin_catch(i8*) -declare void @__cxa_end_catch()
\ No newline at end of file +declare void @__cxa_end_catch() diff --git a/test/CodeGen/AArch64/regress-bitcast-formals.ll b/test/CodeGen/AArch64/regress-bitcast-formals.ll index 28dc9a7e2515d..9655f90d826d0 100644 --- a/test/CodeGen/AArch64/regress-bitcast-formals.ll +++ b/test/CodeGen/AArch64/regress-bitcast-formals.ll @@ -4,7 +4,7 @@ ; actually capable of that (the test was omitted from LowerFormalArguments). define void @test_bitcast_lower(<2 x i32> %a) { -; CHECK: test_bitcast_lower: +; CHECK-LABEL: test_bitcast_lower: ret void ; CHECK: ret diff --git a/test/CodeGen/AArch64/regress-fp128-livein.ll b/test/CodeGen/AArch64/regress-fp128-livein.ll new file mode 100644 index 0000000000000..cb8432a7e4e4a --- /dev/null +++ b/test/CodeGen/AArch64/regress-fp128-livein.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s + +; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB, +; causing a crash during live range calc. +define void @fp128_livein(i64 %a) { + %tobool = icmp ne i64 %a, 0 + %conv = zext i1 %tobool to i32 + %conv2 = sitofp i32 %conv to fp128 + %conv6 = sitofp i32 %conv to double + %call3 = tail call i32 @g(fp128 %conv2) + %call8 = tail call i32 @h(double %conv6) + ret void +} + +declare i32 @f() +declare i32 @g(fp128) +declare i32 @h(double) diff --git a/test/CodeGen/AArch64/regress-tail-livereg.ll b/test/CodeGen/AArch64/regress-tail-livereg.ll index 8d5485cae4c8c..053249c6855f0 100644 --- a/test/CodeGen/AArch64/regress-tail-livereg.ll +++ b/test/CodeGen/AArch64/regress-tail-livereg.ll @@ -4,7 +4,7 @@ declare void @bar() define void @foo() { -; CHECK: foo: +; CHECK-LABEL: foo: %func = load void()** @var ; Calling a function encourages @foo to use a callee-saved register, @@ -16,4 +16,4 @@ define void @foo() { tail call void %func() ; CHECK: br {{x([0-79]|1[0-8])}} ret void -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll index e54552fd8edf2..ff77fb4e48f7e 100644 --- a/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -12,7 +12,7 @@ declare void @bar(i8*) define i64 @test_chains() { -; CHECK: test_chains: +; CHECK-LABEL: test_chains: %locvar = alloca i8 diff --git a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll index 980e2ffef9017..0ef981819ec36 100644 --- a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll +++ b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll @@ -4,8 +4,23 @@ declare void @bar() define void @test_w29_reserved() { -; CHECK: test_w29_reserved: +; CHECK-LABEL: test_w29_reserved: +; CHECK: .cfi_startproc +; CHECK: .cfi_def_cfa sp, 96 ; CHECK: add x29, sp, #{{[0-9]+}} +; CHECK: .cfi_def_cfa x29, 16 +; CHECK: .cfi_offset x30, -8 +; CHECK: .cfi_offset x29, -16 +; CHECK: .cfi_offset x28, -24 +; CHECK: .cfi_offset x27, -32 +; CHECK: .cfi_offset x26, -40 +; CHECK: .cfi_offset x25, -48 +; CHECK: .cfi_offset x24, -56 +; CHECK: .cfi_offset x23, -64 +; CHECK: .cfi_offset x22, -72 +; CHECK: .cfi_offset x21, -80 +; CHECK: .cfi_offset x20, -88 +; CHECK: .cfi_offset x19, -96 %val1 = load volatile i32* @var %val2 = load volatile i32* @var diff --git a/test/CodeGen/AArch64/returnaddr.ll b/test/CodeGen/AArch64/returnaddr.ll new file mode 100644 index 0000000000000..c85f9ec4ffd5b --- /dev/null +++ b/test/CodeGen/AArch64/returnaddr.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i8* @rt0(i32 %x) nounwind readnone { +entry: +; CHECK-LABEL: rt0: +; CHECK: mov x0, x30 + %0 = tail call i8* @llvm.returnaddress(i32 0) + ret i8* %0 +} + +define i8* @rt2() nounwind readnone { +entry: +; CHECK-LABEL: rt2: +; CHECK: ldr x[[reg:[0-9]+]], [x29] +; CHECK: ldr x[[reg]], [x[[reg]]] +; CHECK: ldr x0, [x[[reg]], #8] + %0 = tail call i8* @llvm.returnaddress(i32 2) + ret i8* %0 +} + +declare i8* @llvm.returnaddress(i32) nounwind readnone diff --git a/test/CodeGen/AArch64/setcc-takes-i32.ll b/test/CodeGen/AArch64/setcc-takes-i32.ll index d2eb77ab1b54a..bd79685d34b46 100644 --- a/test/CodeGen/AArch64/setcc-takes-i32.ll +++ b/test/CodeGen/AArch64/setcc-takes-i32.ll @@ -12,11 +12,11 @@ declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) define i64 @test_select(i64 %lhs, i64 %rhs) { -; CHECK: test_select: +; CHECK-LABEL: test_select: %res = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %lhs, i64 %rhs) %flag = extractvalue {i64, i1} %res, 1 %retval = select i1 %flag, i64 %lhs, i64 %rhs ret i64 %retval ; CHECK: ret -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll index a1ec618b03ba6..20f1062a44dc9 100644 --- a/test/CodeGen/AArch64/sibling-call.ll +++ b/test/CodeGen/AArch64/sibling-call.ll @@ -5,7 +5,7 @@ declare void @callee_stack8([8 x i32], i64) declare void @callee_stack16([8 x i32], i64, i64) define void @caller_to0_from0() nounwind { -; CHECK: caller_to0_from0: +; CHECK-LABEL: caller_to0_from0: ; CHECK-NEXT: // BB tail call void @callee_stack0() ret void @@ -13,7 +13,7 @@ define void @caller_to0_from0() nounwind { } define void @caller_to0_from8([8 x i32], i64) nounwind{ -; CHECK: caller_to0_from8: +; CHECK-LABEL: caller_to0_from8: ; CHECK-NEXT: // BB tail call void @callee_stack0() @@ -22,7 +22,7 @@ define void @caller_to0_from8([8 x i32], i64) nounwind{ } define void @caller_to8_from0() { -; CHECK: caller_to8_from0: +; CHECK-LABEL: caller_to8_from0: ; Caller isn't going to clean up any extra stack we allocate, so it ; can't be a tail call. @@ -32,7 +32,7 @@ define void @caller_to8_from0() { } define void @caller_to8_from8([8 x i32], i64 %a) { -; CHECK: caller_to8_from8: +; CHECK-LABEL: caller_to8_from8: ; CHECK-NOT: sub sp, sp, ; This should reuse our stack area for the 42 @@ -43,7 +43,7 @@ define void @caller_to8_from8([8 x i32], i64 %a) { } define void @caller_to16_from8([8 x i32], i64 %a) { -; CHECK: caller_to16_from8: +; CHECK-LABEL: caller_to16_from8: ; Shouldn't be a tail call: we can't use SP+8 because our caller might ; have something there. This may sound obvious but implementation does @@ -54,7 +54,7 @@ define void @caller_to16_from8([8 x i32], i64 %a) { } define void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { -; CHECK: caller_to8_from24: +; CHECK-LABEL: caller_to8_from24: ; CHECK-NOT: sub sp, sp ; Reuse our area, putting "42" at incoming sp @@ -65,7 +65,7 @@ define void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { } define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { -; CHECK: caller_to16_from16: +; CHECK-LABEL: caller_to16_from16: ; CHECK-NOT: sub sp, sp, ; Here we want to make sure that both loads happen before the stores: @@ -85,13 +85,13 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { @func = global void(i32)* null define void @indirect_tail() { -; CHECK: indirect_tail: +; CHECK-LABEL: indirect_tail: ; CHECK-NOT: sub sp, sp %fptr = load void(i32)** @func tail call void %fptr(i32 42) ret void -; CHECK: movz w0, #42 ; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, #:lo12:func] +; CHECK: movz w0, #42 ; CHECK: br [[FPTR]] -}
\ No newline at end of file +} diff --git a/test/CodeGen/AArch64/sincos-expansion.ll b/test/CodeGen/AArch64/sincos-expansion.ll index c7a392b78c245..4cd44494d5457 100644 --- a/test/CodeGen/AArch64/sincos-expansion.ll +++ b/test/CodeGen/AArch64/sincos-expansion.ll @@ -3,8 +3,8 @@ define float @test_sincos_f32(float %f) { %sin = call float @sinf(float %f) readnone %cos = call float @cosf(float %f) readnone -; CHECK: bl cosf ; CHECK: bl sinf +; CHECK: bl cosf %val = fadd float %sin, %cos ret float %val } @@ -13,8 +13,8 @@ define double @test_sincos_f64(double %f) { %sin = call double @sin(double %f) readnone %cos = call double @cos(double %f) readnone %val = fadd double %sin, %cos -; CHECK: bl cos ; CHECK: bl sin +; CHECK: bl cos ret double %val } @@ -22,8 +22,8 @@ define fp128 @test_sincos_f128(fp128 %f) { %sin = call fp128 @sinl(fp128 %f) readnone %cos = call fp128 @cosl(fp128 %f) readnone %val = fadd fp128 %sin, %cos -; CHECK: bl cosl ; CHECK: bl sinl +; CHECK: bl cosl ret fp128 %val } @@ -32,4 +32,4 @@ declare double @sin(double) readonly declare fp128 @sinl(fp128) readonly declare float @cosf(float) readonly declare double @cos(double) readonly -declare fp128 @cosl(fp128) readonly
\ No newline at end of file +declare fp128 @cosl(fp128) readonly diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll index f323b151ad1e2..81885f1085121 100644 --- a/test/CodeGen/AArch64/tail-call.ll +++ b/test/CodeGen/AArch64/tail-call.ll @@ -5,7 +5,7 @@ declare fastcc void @callee_stack8([8 x i32], i64) declare fastcc void @callee_stack16([8 x i32], i64, i64) define fastcc void @caller_to0_from0() nounwind { -; CHECK: caller_to0_from0: +; CHECK-LABEL: caller_to0_from0: ; CHECK-NEXT: // BB tail call fastcc void @callee_stack0() ret void @@ -13,7 +13,7 @@ define fastcc void @caller_to0_from0() nounwind { } define fastcc void @caller_to0_from8([8 x i32], i64) { -; CHECK: caller_to0_from8: +; CHECK-LABEL: caller_to0_from8: tail call fastcc void @callee_stack0() ret void @@ -22,7 +22,7 @@ define fastcc void @caller_to0_from8([8 x i32], i64) { } define fastcc void @caller_to8_from0() { -; CHECK: caller_to8_from0: +; CHECK-LABEL: caller_to8_from0: ; CHECK: sub sp, sp, #32 ; Key point is that the "42" should go #16 below incoming stack @@ -35,7 +35,7 @@ define fastcc void @caller_to8_from0() { } define fastcc void @caller_to8_from8([8 x i32], i64 %a) { -; CHECK: caller_to8_from8: +; CHECK-LABEL: caller_to8_from8: ; CHECK: sub sp, sp, #16 ; Key point is that the "%a" should go where at SP on entry. @@ -47,7 +47,7 @@ define fastcc void @caller_to8_from8([8 x i32], i64 %a) { } define fastcc void @caller_to16_from8([8 x i32], i64 %a) { -; CHECK: caller_to16_from8: +; CHECK-LABEL: caller_to16_from8: ; CHECK: sub sp, sp, #16 ; Important point is that the call reuses the "dead" argument space @@ -63,7 +63,7 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) { define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { -; CHECK: caller_to8_from24: +; CHECK-LABEL: caller_to8_from24: ; CHECK: sub sp, sp, #16 ; Key point is that the "%a" should go where at #16 above SP on entry. 
@@ -76,7 +76,7 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { -; CHECK: caller_to16_from16: +; CHECK-LABEL: caller_to16_from16: ; CHECK: sub sp, sp, #16 ; Here we want to make sure that both loads happen before the stores: diff --git a/test/CodeGen/AArch64/tls-dynamic-together.ll b/test/CodeGen/AArch64/tls-dynamic-together.ll index bad2298c8a65b..b5d7d89384442 100644 --- a/test/CodeGen/AArch64/tls-dynamic-together.ll +++ b/test/CodeGen/AArch64/tls-dynamic-together.ll @@ -8,7 +8,7 @@ @general_dynamic_var = external thread_local global i32 define i32 @test_generaldynamic() { -; CHECK: test_generaldynamic: +; CHECK-LABEL: test_generaldynamic: %val = load i32* @general_dynamic_var ret i32 %val diff --git a/test/CodeGen/AArch64/tls-dynamics.ll b/test/CodeGen/AArch64/tls-dynamics.ll index cdfd11783c239..68c481ce98b6a 100644 --- a/test/CodeGen/AArch64/tls-dynamics.ll +++ b/test/CodeGen/AArch64/tls-dynamics.ll @@ -4,14 +4,14 @@ @general_dynamic_var = external thread_local global i32 define i32 @test_generaldynamic() { -; CHECK: test_generaldynamic: +; CHECK-LABEL: test_generaldynamic: %val = load i32* @general_dynamic_var ret i32 %val ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var -; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var -; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var] +; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var +; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var] ; CHECK: .tlsdesccall general_dynamic_var ; CHECK-NEXT: blr [[CALLEE]] @@ -19,20 +19,20 @@ define i32 @test_generaldynamic() { ; CHECK: ldr w0, [x[[TP]], x0] ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL } define i32* @test_generaldynamic_addr() { -; CHECK: test_generaldynamic_addr: +; CHECK-LABEL: test_generaldynamic_addr: ret i32* @general_dynamic_var ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var -; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var -; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var] +; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var +; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var] ; CHECK: .tlsdesccall general_dynamic_var ; CHECK-NEXT: blr [[CALLEE]] @@ -40,8 +40,8 @@ define i32* @test_generaldynamic_addr() { ; CHECK: add x0, [[TP]], x0 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL } @@ -49,14 +49,14 @@ define i32* @test_generaldynamic_addr() { @local_dynamic_var = external thread_local(localdynamic) global i32 define i32 @test_localdynamic() { -; CHECK: test_localdynamic: +; CHECK-LABEL: test_localdynamic: %val = load i32* @local_dynamic_var ret i32 %val ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ -; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ -; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] +; CHECK-DAG: 
add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ +; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] ; CHECK: .tlsdesccall _TLS_MODULE_BASE_ ; CHECK-NEXT: blr [[CALLEE]] @@ -66,20 +66,20 @@ define i32 @test_localdynamic() { ; CHECK: ldr w0, [x0, [[DTP_OFFSET]]] ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL } define i32* @test_localdynamic_addr() { -; CHECK: test_localdynamic_addr: +; CHECK-LABEL: test_localdynamic_addr: ret i32* @local_dynamic_var ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ -; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ -; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] +; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ +; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] ; CHECK: .tlsdesccall _TLS_MODULE_BASE_ ; CHECK-NEXT: blr [[CALLEE]] @@ -89,8 +89,8 @@ define i32* @test_localdynamic_addr() { ; CHECK: add x0, x0, [[DTP_OFFSET]] ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE -; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC -; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL } @@ -101,7 +101,7 @@ define i32* @test_localdynamic_addr() { @local_dynamic_var2 = external thread_local(localdynamic) global i32 define i32 @test_localdynamic_deduplicate() { -; CHECK: test_localdynamic_deduplicate: +; CHECK-LABEL: test_localdynamic_deduplicate: %val = load i32* @local_dynamic_var %val2 = load i32* @local_dynamic_var2 @@ -110,8 +110,8 @@ define i32 @test_localdynamic_deduplicate() { ret i32 %sum ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ -; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ -; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] +; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ +; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] ; CHECK: .tlsdesccall _TLS_MODULE_BASE_ ; CHECK-NEXT: blr [[CALLEE]] diff --git a/test/CodeGen/AArch64/tls-execs.ll b/test/CodeGen/AArch64/tls-execs.ll index a665884227936..39ceb9a4795c9 100644 --- a/test/CodeGen/AArch64/tls-execs.ll +++ b/test/CodeGen/AArch64/tls-execs.ll @@ -1,10 +1,10 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s @initial_exec_var = external thread_local(initialexec) global i32 define i32 @test_initial_exec() { -; CHECK: test_initial_exec: +; CHECK-LABEL: test_initial_exec: %val = load i32* @initial_exec_var ; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var @@ -19,7 +19,7 @@ define i32 @test_initial_exec() { } define i32* @test_initial_exec_addr() { -; CHECK: test_initial_exec_addr: +; CHECK-LABEL: test_initial_exec_addr: ret i32* @initial_exec_var ; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var @@ -35,10 +35,10 @@ define i32* @test_initial_exec_addr() { @local_exec_var = 
thread_local(initialexec) global i32 0 define i32 @test_local_exec() { -; CHECK: test_local_exec: +; CHECK-LABEL: test_local_exec: %val = load i32* @local_exec_var -; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var +; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [A,A,0xa0'A',0x92'A'] ; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var ; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0 ; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]] @@ -50,7 +50,7 @@ define i32 @test_local_exec() { } define i32* @test_local_exec_addr() { -; CHECK: test_local_exec_addr: +; CHECK-LABEL: test_local_exec_addr: ret i32* @local_exec_var ; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll index 65c1fda49e2d4..154bc08c144ce 100644 --- a/test/CodeGen/AArch64/tst-br.ll +++ b/test/CodeGen/AArch64/tst-br.ll @@ -7,7 +7,7 @@ @var64 = global i64 0 define i32 @test_tbz() { -; CHECK: test_tbz: +; CHECK-LABEL: test_tbz: %val = load i32* @var32 %val64 = load i64* @var64 diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll index c5d319eb112b8..f3d376beeb282 100644 --- a/test/CodeGen/AArch64/variadic.ll +++ b/test/CodeGen/AArch64/variadic.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck --check-prefix=CHECK-NOFP %s %va_list = type {i8*, i8*, i8*, i32, i32} @@ -7,21 +8,30 @@ declare void @llvm.va_start(i8*) define void @test_simple(i32 %n, ...) { -; CHECK: test_simple: +; CHECK-LABEL: test_simple: ; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var ; CHECK: mov x[[FPRBASE:[0-9]+]], sp ; CHECK: str q7, [x[[FPRBASE]], #112] ; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] ; CHECK: str x7, [x[[GPRBASE]], #48] +; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]] +; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] +; CHECK-NOFP: str x7, [x[[GPRBASE]], #48] +; CHECK-NOFP-NOT: str q7, +; CHECK-NOFP: str x1, [sp, #[[GPRFROMSP]]] + ; Omit the middle ones ; CHECK: str q0, [sp] ; CHECK: str x1, [sp, #[[GPRFROMSP]]] +; CHECK-NOFP-NOT: str q0, [sp] + %addr = bitcast %va_list* @var to i8* call void @llvm.va_start(i8* %addr) -; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var ; CHECK: movn [[VR_OFFS:w[0-9]+]], #127 ; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] ; CHECK: movn [[GR_OFFS:w[0-9]+]], #55 @@ -33,22 +43,38 @@ define void @test_simple(i32 %n, ...) { ; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] +; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #55 +; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24] +; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56 +; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8] +; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] +; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] + ret void } define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) 
{ -; CHECK: test_fewargs: +; CHECK-LABEL: test_fewargs: ; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] ; CHECK: mov x[[FPRBASE:[0-9]+]], sp ; CHECK: str q7, [x[[FPRBASE]], #96] ; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] ; CHECK: str x7, [x[[GPRBASE]], #32] +; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]] +; CHECK-NOFP-NOT: str q7, +; CHECK-NOFP: mov x[[GPRBASE:[0-9]+]], sp +; CHECK-NOFP: str x7, [x[[GPRBASE]], #24] + ; Omit the middle ones ; CHECK: str q1, [sp] ; CHECK: str x3, [sp, #[[GPRFROMSP]]] +; CHECK-NOFP-NOT: str q1, [sp] +; CHECK-NOFP: str x4, [sp] + %addr = bitcast %va_list* @var to i8* call void @llvm.va_start(i8* %addr) ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var @@ -63,11 +89,20 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) { ; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] +; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #31 +; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24] +; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #32 +; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8] +; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] +; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] + ret void } define void @test_nospare([8 x i64], [8 x float], ...) { -; CHECK: test_nospare: +; CHECK-LABEL: test_nospare: %addr = bitcast %va_list* @var to i8* call void @llvm.va_start(i8* %addr) @@ -75,18 +110,25 @@ define void @test_nospare([8 x i64], [8 x float], ...) { ; CHECK: mov [[STACK:x[0-9]+]], sp ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP-NOT: sub sp, sp +; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #64 +; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] ret void } ; If there are non-variadic arguments on the stack (here two i64s) then the ; __stack field should point just past them. define void @test_offsetstack([10 x i64], [3 x float], ...) { -; CHECK: test_offsetstack: +; CHECK-LABEL: test_offsetstack: ; CHECK: sub sp, sp, #80 ; CHECK: mov x[[FPRBASE:[0-9]+]], sp ; CHECK: str q7, [x[[FPRBASE]], #64] ; CHECK-NOT: str x{{[0-9]+}}, + +; CHECK-NOFP-NOT: str q7, +; CHECK-NOT: str x7, + ; Omit the middle ones ; CHECK: str q3, [sp] @@ -102,20 +144,27 @@ define void @test_offsetstack([10 x i64], [3 x float], ...) 
{ ; CHECK: add [[STACK:x[0-9]+]], sp, #96 ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #40 +; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28] +; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24] ret void } declare void @llvm.va_end(i8*) define void @test_va_end() nounwind { -; CHECK: test_va_end: +; CHECK-LABEL: test_va_end: ; CHECK-NEXT: BB#0 +; CHECK-NOFP: BB#0 %addr = bitcast %va_list* @var to i8* call void @llvm.va_end(i8* %addr) ret void ; CHECK-NEXT: ret +; CHECK-NOFP-NEXT: ret } declare void @llvm.va_copy(i8* %dest, i8* %src) @@ -123,7 +172,7 @@ declare void @llvm.va_copy(i8* %dest, i8* %src) @second_list = global %va_list zeroinitializer define void @test_va_copy() { -; CHECK: test_va_copy: +; CHECK-LABEL: test_va_copy: %srcaddr = bitcast %va_list* @var to i8* %dstaddr = bitcast %va_list* @second_list to i8* call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr) @@ -131,14 +180,25 @@ define void @test_va_copy() { ; Check beginning and end again: ; CHECK: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] +; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] +; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var + ; CHECK: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list] +; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24] ; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list -; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var -; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24] ; CHECK: str [[BLOCK]], [x[[DEST_LIST]], #24] +; CHECK-NOFP: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list] + +; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24] +; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list + +; CHECK-NOFP: str [[BLOCK]], [x[[DEST_LIST]], #24] + ret void ; CHECK: ret +; CHECK-NOFP: ret } diff --git a/test/CodeGen/AArch64/zero-reg.ll b/test/CodeGen/AArch64/zero-reg.ll index fef0437ae7f3e..9b1e52770ce42 100644 --- a/test/CodeGen/AArch64/zero-reg.ll +++ b/test/CodeGen/AArch64/zero-reg.ll @@ -4,7 +4,7 @@ @var64 = global i64 0 define void @test_zr() { -; CHECK: test_zr: +; CHECK-LABEL: test_zr: store i32 0, i32* @var32 ; CHECK: str wzr, [{{x[0-9]+}}, #:lo12:var32] @@ -16,7 +16,7 @@ define void @test_zr() { } define void @test_sp(i32 %val) { -; CHECK: test_sp: +; CHECK-LABEL: test_sp: ; Important correctness point here is that LLVM doesn't try to use xzr ; as an addressing register: "str w0, [xzr]" is not a valid A64 @@ -28,4 +28,4 @@ define void @test_sp(i32 %val) { ret void ; CHECK: ret -}
\ No newline at end of file +}
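
The recurring change across these tests is the switch from plain "; CHECK: <name>:" to "; CHECK-LABEL: <name>:". FileCheck treats every CHECK-LABEL match as a block boundary, so the checks written for one function can no longer be satisfied by assembly emitted for a neighbouring function, and a failure is reported against the right function instead of silently matching later output. The snippet below is a minimal illustrative sketch, not part of this commit: the function names @first and @second are invented, and the expected add/sub operands assume the standard AAPCS lowering (arguments in w0/w1, result in w0).

; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s

; Each CHECK-LABEL match starts a new FileCheck block, so the ADD check can
; only match inside @first's output and the SUB check only inside @second's.
define i32 @first(i32 %a, i32 %b) {
; CHECK-LABEL: first:
; CHECK: add {{w[0-9]+}}, w0, w1
  %sum = add i32 %a, %b
  ret i32 %sum
}

define i32 @second(i32 %a, i32 %b) {
; CHECK-LABEL: second:
; CHECK: sub {{w[0-9]+}}, w0, w1
  %diff = sub i32 %a, %b
  ret i32 %diff
}

With the old bare "; CHECK: first:" form, the add pattern could in principle match an instruction anywhere later in the file; the CHECK-LABEL conversion applied throughout this diff confines each group of checks to its own function's output.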