From d99dafe2e4a385dd2a6c76da6d8258deb100657b Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Thu, 20 Apr 2017 21:19:10 +0000 Subject: Vendor import of llvm trunk r300890: https://llvm.org/svn/llvm-project/llvm/trunk@300890 --- test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll | 16 + test/CodeGen/AArch64/arm64-abi.ll | 4 +- test/CodeGen/AArch64/nonlazybind.ll | 40 + .../AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll | 1326 +++-- .../code-object-metadata-from-llvm-ir-full.ll | 6 +- test/CodeGen/AMDGPU/exceed-max-sgprs.ll | 2 +- test/CodeGen/AMDGPU/flat-scratch-reg.ll | 59 +- test/CodeGen/AMDGPU/frame-index-amdgiz.ll | 55 + test/CodeGen/AMDGPU/hsa-func-align.ll | 18 + test/CodeGen/AMDGPU/hsa-func.ll | 27 +- test/CodeGen/AMDGPU/loop_break.ll | 2 +- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 180 +- test/CodeGen/AMDGPU/nested-loop-conditions.ll | 23 +- test/CodeGen/AMDGPU/ret_jump.ll | 2 +- test/CodeGen/AMDGPU/select-vectors.ll | 389 +- .../ARM/GlobalISel/arm-instruction-select.mir | 241 + test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll | 39 + test/CodeGen/ARM/GlobalISel/arm-isel.ll | 56 +- test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 156 + test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir | 164 + test/CodeGen/ARM/alloc-no-stack-realign.ll | 101 +- test/CodeGen/ARM/build-attributes.ll | 461 +- test/CodeGen/ARM/darwin-tls-preserved.ll | 24 + test/CodeGen/ARM/divmod-hwdiv.ll | 37 + test/CodeGen/ARM/fpoffset_overflow.mir | 94 + test/CodeGen/ARM/memcpy-inline.ll | 17 +- test/CodeGen/ARM/memset-inline.ll | 6 +- test/CodeGen/ARM/vbits.ll | 560 +- test/CodeGen/ARM/vector-load.ll | 17 +- test/CodeGen/ARM/vector-store.ll | 10 + test/CodeGen/ARM/vlddup.ll | 17 + test/CodeGen/ARM/vldlane.ll | 16 + test/CodeGen/ARM/vtbl.ll | 2 +- test/CodeGen/AVR/alloca.ll | 6 +- test/CodeGen/AVR/call.ll | 29 +- test/CodeGen/AVR/directmem.ll | 32 +- test/CodeGen/AVR/inline-asm/multibyte.ll | 135 - test/CodeGen/AVR/varargs.ll | 8 +- test/CodeGen/Hexagon/addrmode-globoff.mir | 25 + 
test/CodeGen/Mips/msa/shift_constant_pool.ll | 171 + test/CodeGen/Mips/msa/shift_no_and.ll | 460 ++ test/CodeGen/PowerPC/andc.ll | 50 +- test/CodeGen/WebAssembly/returned.ll | 31 + test/CodeGen/X86/GlobalISel/X86-regbankselect.mir | 28 + test/CodeGen/X86/GlobalISel/binop-isel.ll | 186 - test/CodeGen/X86/GlobalISel/binop.ll | 186 + .../GlobalISel/frameIndex-instructionselect.mir | 36 - test/CodeGen/X86/GlobalISel/legalize-const.mir | 43 - test/CodeGen/X86/GlobalISel/legalize-constant.mir | 43 + test/CodeGen/X86/GlobalISel/legalize-trunc.mir | 31 + test/CodeGen/X86/GlobalISel/memop-isel.ll | 189 - test/CodeGen/X86/GlobalISel/memop.ll | 189 + test/CodeGen/X86/GlobalISel/select-add.mir | 226 + test/CodeGen/X86/GlobalISel/select-frameIndex.mir | 36 + test/CodeGen/X86/GlobalISel/select-memop.mir | 582 ++ test/CodeGen/X86/GlobalISel/select-sub.mir | 225 + test/CodeGen/X86/GlobalISel/select-trunc.mir | 183 + test/CodeGen/X86/GlobalISel/trunc.ll | 57 + .../X86/GlobalISel/x86_64-instructionselect.mir | 1022 ---- test/CodeGen/X86/MergeConsecutiveStores.ll | 19 + test/CodeGen/X86/avx-logic.ll | 36 +- test/CodeGen/X86/avx512-ext.ll | 8 +- test/CodeGen/X86/avx512-mask-op.ll | 15 +- test/CodeGen/X86/bswap_tree.ll | 105 + test/CodeGen/X86/bswap_tree2.ll | 150 + test/CodeGen/X86/combine-or.ll | 16 +- test/CodeGen/X86/dbg-baseptr.ll | 75 + test/CodeGen/X86/extract-store.ll | 37 + test/CodeGen/X86/fp128-extract.ll | 22 + test/CodeGen/X86/i64-to-float.ll | 46 +- test/CodeGen/X86/known-signbits-vector.ll | 28 +- test/CodeGen/X86/madd.ll | 324 +- test/CodeGen/X86/merge_store.ll | 31 + test/CodeGen/X86/sse-schedule.ll | 2415 ++++++++ test/CodeGen/X86/sse2-schedule.ll | 6039 ++++++++++++++++++++ test/CodeGen/X86/tail-merge-after-mbp.ll | 94 - test/CodeGen/X86/tail-merge-after-mbp.mir | 105 + test/CodeGen/X86/vector-rotate-128.ll | 11 - test/CodeGen/X86/vector-rotate-256.ll | 17 +- test/CodeGen/X86/x86-16.ll | 9 +- 80 files changed, 14851 insertions(+), 3127 deletions(-) create mode 100644 
test/CodeGen/AArch64/nonlazybind.ll create mode 100644 test/CodeGen/AMDGPU/frame-index-amdgiz.ll create mode 100644 test/CodeGen/AMDGPU/hsa-func-align.ll create mode 100644 test/CodeGen/ARM/darwin-tls-preserved.ll create mode 100644 test/CodeGen/ARM/divmod-hwdiv.ll create mode 100644 test/CodeGen/ARM/fpoffset_overflow.mir delete mode 100644 test/CodeGen/AVR/inline-asm/multibyte.ll create mode 100644 test/CodeGen/Hexagon/addrmode-globoff.mir create mode 100644 test/CodeGen/Mips/msa/shift_constant_pool.ll create mode 100644 test/CodeGen/Mips/msa/shift_no_and.ll delete mode 100644 test/CodeGen/X86/GlobalISel/binop-isel.ll create mode 100644 test/CodeGen/X86/GlobalISel/binop.ll delete mode 100644 test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir delete mode 100644 test/CodeGen/X86/GlobalISel/legalize-const.mir create mode 100644 test/CodeGen/X86/GlobalISel/legalize-constant.mir create mode 100644 test/CodeGen/X86/GlobalISel/legalize-trunc.mir delete mode 100644 test/CodeGen/X86/GlobalISel/memop-isel.ll create mode 100644 test/CodeGen/X86/GlobalISel/memop.ll create mode 100644 test/CodeGen/X86/GlobalISel/select-add.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-frameIndex.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-sub.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-trunc.mir create mode 100644 test/CodeGen/X86/GlobalISel/trunc.ll delete mode 100644 test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir create mode 100644 test/CodeGen/X86/bswap_tree.ll create mode 100644 test/CodeGen/X86/bswap_tree2.ll create mode 100644 test/CodeGen/X86/dbg-baseptr.ll create mode 100644 test/CodeGen/X86/fp128-extract.ll create mode 100644 test/CodeGen/X86/sse-schedule.ll create mode 100644 test/CodeGen/X86/sse2-schedule.ll delete mode 100644 test/CodeGen/X86/tail-merge-after-mbp.ll create mode 100644 test/CodeGen/X86/tail-merge-after-mbp.mir (limited to 'test/CodeGen') diff 
--git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index e40199d82c9d..71ea9d54f647 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -154,3 +154,19 @@ continue: define fp128 @test_quad_dump() { ret fp128 0xL00000000000000004000000000000000 } + +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg0(p0) = G_EXTRACT_VECTOR_ELT %vreg1, %vreg2; (in function: vector_of_pointers_extractelement) +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for vector_of_pointers_extractelement +; FALLBACK-WITH-REPORT-OUT-LABEL: vector_of_pointers_extractelement: +define void @vector_of_pointers_extractelement() { + %dummy = extractelement <2 x i16*> undef, i32 0 + ret void +} + +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg0(<2 x p0>) = G_INSERT_VECTOR_ELT %vreg1, %vreg2, %vreg3; (in function: vector_of_pointers_insertelement +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for vector_of_pointers_insertelement +; FALLBACK-WITH-REPORT-OUT-LABEL: vector_of_pointers_insertelement: +define void @vector_of_pointers_insertelement() { + %dummy = insertelement <2 x i16*> undef, i16* null, i32 0 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll index 6cf0ab35b9b5..5be84b7d493b 100644 --- a/test/CodeGen/AArch64/arm64-abi.ll +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -43,9 +43,7 @@ entry: ; CHECK-LABEL: i8i16caller ; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5. ; They are i8, i16, i8 and i8. 
-; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5] -; CHECK-DAG: strb {{w[0-9]+}}, [sp, #4] -; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2] +; CHECK-DAG: stur {{w[0-9]+}}, [sp, #2] ; CHECK-DAG: strb {{w[0-9]+}}, [sp] ; CHECK: bl ; FAST-LABEL: i8i16caller diff --git a/test/CodeGen/AArch64/nonlazybind.ll b/test/CodeGen/AArch64/nonlazybind.ll new file mode 100644 index 000000000000..4355d45fe84d --- /dev/null +++ b/test/CodeGen/AArch64/nonlazybind.ll @@ -0,0 +1,40 @@ +; RUN: llc -mtriple=aarch64-apple-ios %s -o - -aarch64-enable-nonlazybind | FileCheck %s +; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s --check-prefix=CHECK-NORMAL + +define void @local() nonlazybind { + ret void +} + +declare void @nonlocal() nonlazybind + +define void @test_laziness() { +; CHECK-LABEL: test_laziness: + +; CHECK: bl _local + +; CHECK: adrp x[[TMP:[0-9]+]], _nonlocal@GOTPAGE +; CHECK: ldr [[FUNC:x[0-9]+]], [x[[TMP]], _nonlocal@GOTPAGEOFF] +; CHECK: blr [[FUNC]] + +; CHECK-NORMAL-LABEL: test_laziness: +; CHECK-NORMAL: bl _local +; CHEKC-NORMAL: bl _nonlocal + + call void @local() + call void @nonlocal() + ret void +} + +define void @test_laziness_tail() { +; CHECK-LABEL: test_laziness_tail: + +; CHECK: adrp x[[TMP:[0-9]+]], _nonlocal@GOTPAGE +; CHECK: ldr [[FUNC:x[0-9]+]], [x[[TMP]], _nonlocal@GOTPAGEOFF] +; CHECK: br [[FUNC]] + +; CHECK-NORMAL-LABEL: test_laziness_tail: +; CHECK-NORMAL: b _nonlocal + + tail call void @nonlocal() + ret void +} diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll index 95a206e1dd00..8e5a512dd3c9 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll @@ -3,333 +3,358 @@ ; GCN-LABEL: @add_i3( ; SI: %r = add i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 
%[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_i3(i3 %a, i3 %b) { %r = add i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_i3( ; SI: %r = add nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) { %r = add nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_i3( ; SI: %r = add nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) { %r = add nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_i3( ; SI: %r = add nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) { %r = add nuw nsw i3 
%a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_i3( ; SI: %r = sub i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) { %r = sub i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_i3( ; SI: %r = sub nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) { %r = sub nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_i3( ; SI: %r = sub nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) { %r = sub nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_i3( ; SI: %r = sub nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; 
VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) { %r = sub nuw nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_i3( ; SI: %r = mul i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) { %r = mul i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_i3( ; SI: %r = mul nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) { %r = mul nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_i3( ; SI: %r = mul nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) 
{ %r = mul nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_i3( ; SI: %r = mul nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) { %r = mul nuw nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_i3( ; SI: %r = urem i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @urem_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @urem_i3(i3 %a, i3 %b) { %r = urem i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_i3( ; SI: %r = srem i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @srem_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @srem_i3(i3 %a, i3 %b) { %r = srem i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_i3( ; SI: %r = shl i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to 
i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) { %r = shl i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_i3( ; SI: %r = shl nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) { %r = shl nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_i3( ; SI: %r = shl nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) { %r = shl nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_i3( ; SI: %r = shl nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void 
@shl_nuw_nsw_i3(i3 %a, i3 %b) { %r = shl nuw nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_i3( ; SI: %r = lshr i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @lshr_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) { %r = lshr i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_i3( ; SI: %r = lshr exact i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @lshr_exact_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) { %r = lshr exact i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_i3( ; SI: %r = ashr i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @ashr_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) { %r = ashr i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_i3( ; SI: %r = ashr exact i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32 ; 
VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @ashr_exact_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) { %r = ashr exact i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @and_i3( ; SI: %r = and i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @and_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @and_i3(i3 %a, i3 %b) { %r = and i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @or_i3( ; SI: %r = or i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @or_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @or_i3(i3 %a, i3 %b) { %r = or i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_i3( ; SI: %r = xor i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @xor_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) { %r = xor i3 
%a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_i3( ; SI: %cmp = icmp eq i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]] @@ -337,17 +362,18 @@ define i3 @xor_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_eq_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) { %cmp = icmp eq i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_i3( ; SI: %cmp = icmp ne i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]] @@ -355,17 +381,18 @@ define i3 @select_eq_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ne_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) { %cmp = icmp ne i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_i3( ; SI: %cmp = icmp ugt i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store 
volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]] @@ -373,17 +400,18 @@ define i3 @select_ne_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ugt_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) { %cmp = icmp ugt i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_i3( ; SI: %cmp = icmp uge i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]] @@ -391,17 +419,18 @@ define i3 @select_ugt_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_uge_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) { %cmp = icmp uge i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_i3( ; SI: %cmp = icmp ult i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]] @@ -409,17 +438,18 @@ define i3 
@select_uge_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ult_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) { %cmp = icmp ult i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_i3( ; SI: %cmp = icmp ule i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]] @@ -427,17 +457,18 @@ define i3 @select_ult_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ule_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) { %cmp = icmp ule i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_i3( ; SI: %cmp = icmp sgt i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]] @@ -445,17 +476,18 @@ define i3 @select_ule_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 
%[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_sgt_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) { %cmp = icmp sgt i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_i3( ; SI: %cmp = icmp sge i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]] @@ -463,17 +495,18 @@ define i3 @select_sgt_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_sge_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) { %cmp = icmp sge i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_i3( ; SI: %cmp = icmp slt i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]] @@ -481,17 +514,18 @@ define i3 @select_sge_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_slt_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) { %cmp = icmp slt i3 
%a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_i3( ; SI: %cmp = icmp sle i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]] @@ -499,384 +533,415 @@ define i3 @select_slt_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_sle_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) { %cmp = icmp sle i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } declare i3 @llvm.bitreverse.i3(i3) ; GCN-LABEL: @bitreverse_i3( ; SI: %brev = call i3 @llvm.bitreverse.i3(i3 %a) -; SI-NEXT: ret i3 %brev +; SI-NEXT: store volatile i3 %brev ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 29 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[S_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @bitreverse_i3(i3 %a) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @bitreverse_i3(i3 %a) { %brev = call i3 @llvm.bitreverse.i3(i3 %a) - ret i3 %brev + store volatile i3 %brev, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_i16( ; SI: %r = add i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 
%[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_i16(i16 %a, i16 %b) { %r = add i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_add_i16( -; VI: ret i16 3 -define i16 @constant_add_i16() { +; VI: store volatile i16 3 +define amdgpu_kernel void @constant_add_i16() { %r = add i16 1, 2 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_add_nsw_i16( -; VI: ret i16 3 -define i16 @constant_add_nsw_i16() { +; VI: store volatile i16 3 +define amdgpu_kernel void @constant_add_nsw_i16() { %r = add nsw i16 1, 2 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_add_nuw_i16( -; VI: ret i16 3 -define i16 @constant_add_nuw_i16() { +; VI: store volatile i16 3 +define amdgpu_kernel void @constant_add_nuw_i16() { %r = add nsw i16 1, 2 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_i16( ; SI: %r = add nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) { %r = add nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_i16( ; SI: %r = add nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 
-; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) { %r = add nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_i16( ; SI: %r = add nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) { %r = add nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_i16( ; SI: %r = sub i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) { %r = sub i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_i16( ; SI: %r = sub nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) { %r = sub nsw i16 %a, %b - ret 
i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_i16( ; SI: %r = sub nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) { %r = sub nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_i16( ; SI: %r = sub nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) { %r = sub nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_i16( ; SI: %r = mul i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) { %r = mul i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_i16( ; SI: %r = mul nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: 
%[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) { %r = mul nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_i16( ; SI: %r = mul nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) { %r = mul nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_i16( ; SI: %r = mul nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) { %r = mul nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_i16( ; SI: %r = urem i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc 
i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @urem_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @urem_i16(i16 %a, i16 %b) { %r = urem i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_i16( ; SI: %r = srem i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @srem_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @srem_i16(i16 %a, i16 %b) { %r = srem i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_i16( ; SI: %r = shl i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) { %r = shl i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_i16( ; SI: %r = shl nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) { %r = shl nsw i16 %a, %b - ret i16 %r + store 
volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_i16( ; SI: %r = shl nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) { %r = shl nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_i16( ; SI: %r = shl nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) { %r = shl nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_i16( ; SI: %r = lshr i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @lshr_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) { %r = lshr i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_i16( ; SI: %r = lshr exact i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: 
%[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @lshr_exact_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) { %r = lshr exact i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_i16( ; SI: %r = ashr i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @ashr_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) { %r = ashr i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_i16( ; SI: %r = ashr exact i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @ashr_exact_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) { %r = ashr exact i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_lshr_exact_i16( -; VI: ret i16 2 -define i16 @constant_lshr_exact_i16(i16 %a, i16 %b) { +; VI: store volatile i16 2 +define amdgpu_kernel void @constant_lshr_exact_i16(i16 %a, i16 %b) { %r = lshr exact i16 4, 1 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; 
GCN-LABEL: @and_i16( ; SI: %r = and i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @and_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @and_i16(i16 %a, i16 %b) { %r = and i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @or_i16( ; SI: %r = or i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @or_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @or_i16(i16 %a, i16 %b) { %r = or i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_i16( ; SI: %r = xor i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @xor_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) { %r = xor i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_i16( ; SI: %cmp = icmp eq i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: 
%[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]] @@ -884,17 +949,18 @@ define i16 @xor_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_eq_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) { %cmp = icmp eq i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_i16( ; SI: %cmp = icmp ne i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]] @@ -902,17 +968,18 @@ define i16 @select_eq_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ne_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) { %cmp = icmp ne i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_i16( ; SI: %cmp = icmp ugt i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]] @@ -920,17 +987,18 @@ define i16 @select_ne_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 
%b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ugt_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) { %cmp = icmp ugt i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_i16( ; SI: %cmp = icmp uge i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]] @@ -938,17 +1006,18 @@ define i16 @select_ugt_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_uge_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) { %cmp = icmp uge i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_i16( ; SI: %cmp = icmp ult i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]] @@ -956,17 +1025,18 @@ define i16 @select_uge_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 
%[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ult_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) { %cmp = icmp ult i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_i16( ; SI: %cmp = icmp ule i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]] @@ -974,17 +1044,18 @@ define i16 @select_ult_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ule_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) { %cmp = icmp ule i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_i16( ; SI: %cmp = icmp sgt i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]] @@ -992,17 +1063,18 @@ define i16 @select_ule_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_sgt_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] 
+define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) { %cmp = icmp sgt i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_i16( ; SI: %cmp = icmp sge i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]] @@ -1010,17 +1082,18 @@ define i16 @select_sgt_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_sge_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) { %cmp = icmp sge i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_i16( ; SI: %cmp = icmp slt i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]] @@ -1028,17 +1101,18 @@ define i16 @select_sge_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_slt_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) { %cmp = icmp slt i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel 
+ store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_i16( ; SI: %cmp = icmp sle i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]] @@ -1046,356 +1120,384 @@ define i16 @select_slt_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_sle_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) { %cmp = icmp sle i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } declare i16 @llvm.bitreverse.i16(i16) + ; GCN-LABEL: @bitreverse_i16( ; SI: %brev = call i16 @llvm.bitreverse.i16(i16 %a) -; SI-NEXT: ret i16 %brev +; SI-NEXT: store volatile i16 %brev ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 16 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[S_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @bitreverse_i16(i16 %a) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @bitreverse_i16(i16 %a) { %brev = call i16 @llvm.bitreverse.i16(i16 %a) - ret i16 %brev + store volatile i16 %brev, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_3xi15( ; SI: %r = add <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> 
%[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_3xi15( ; SI: %r = add nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_3xi15( ; SI: %r = add nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_3xi15( ; SI: %r = add nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; 
SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_3xi15( ; SI: %r = sub <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_3xi15( ; SI: %r = sub nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub nsw <3 x i15> %a, %b - 
ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_3xi15( ; SI: %r = sub nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_3xi15( ; SI: %r = sub nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_3xi15( ; SI: %r = mul <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] 
-define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_3xi15( ; SI: %r = mul nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_3xi15( ; SI: %r = mul nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_3xi15( ; SI: %r = mul nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> 
%b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_3xi15( ; SI: %r = urem <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @urem_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @urem_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = urem <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_3xi15( ; SI: %r = srem <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @srem_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @srem_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = srem <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_3xi15( ; SI: %r = shl <3 x i15> %a, %b -; 
SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = shl <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_3xi15( ; SI: %r = shl nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = shl nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_3xi15( ; SI: %r = shl nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { 
%r = shl nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_3xi15( ; SI: %r = shl nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = shl nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_3xi15( ; SI: %r = lshr <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = lshr <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_3xi15( ; SI: %r = lshr exact <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: 
ret <3 x i15> %[[R_15]] -define <3 x i15> @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = lshr exact <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_3xi15( ; SI: %r = ashr <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = ashr <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_3xi15( ; SI: %r = ashr exact <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = ashr exact <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @and_3xi15( ; SI: %r = and <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: 
%[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @and_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = and <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @or_3xi15( ; SI: %r = or <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @or_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = or <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_3xi15( ; SI: %r = xor <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @xor_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = xor <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_3xi15( ; SI: %cmp = icmp eq <3 x i15> %a, %b ; SI-NEXT: 
%sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1403,17 +1505,18 @@ define <3 x i15> @xor_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp eq <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_3xi15( ; SI: %cmp = icmp ne <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1421,17 +1524,18 @@ define <3 x i15> @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x 
i15> %b) { %cmp = icmp ne <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_3xi15( ; SI: %cmp = icmp ugt <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1439,17 +1543,18 @@ define <3 x i15> @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp ugt <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_3xi15( ; SI: %cmp = icmp uge <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1457,17 +1562,18 @@ define <3 x i15> @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> 
%[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp uge <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_3xi15( ; SI: %cmp = icmp ult <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1475,17 +1581,18 @@ define <3 x i15> @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp ult <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_3xi15( ; SI: %cmp = icmp ule <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: 
%[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1493,17 +1600,18 @@ define <3 x i15> @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp ule <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_3xi15( ; SI: %cmp = icmp sgt <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1511,17 +1619,18 @@ define <3 x i15> @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp sgt <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_3xi15( ; SI: %cmp = 
icmp sge <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1529,17 +1638,18 @@ define <3 x i15> @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp sge <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_3xi15( ; SI: %cmp = icmp slt <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1547,17 +1657,18 @@ define <3 x i15> @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define 
amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp slt <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_3xi15( ; SI: %cmp = icmp sle <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1565,356 +1676,383 @@ define <3 x i15> @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp sle <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } declare <3 x i15> @llvm.bitreverse.v3i15(<3 x i15>) ; GCN-LABEL: @bitreverse_3xi15( ; SI: %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a) -; SI-NEXT: ret <3 x i15> %brev +; SI-NEXT: store volatile <3 x i15> %brev ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @bitreverse_3xi15(<3 x i15> %a) { +; VI-NEXT: 
store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) { %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a) - ret <3 x i15> %brev + store volatile <3 x i15> %brev, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_3xi16( ; SI: %r = add <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_3xi16( ; SI: %r = add nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_3xi16( ; SI: %r = add nuw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 
x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_3xi16( ; SI: %r = add nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_3xi16( ; SI: %r = sub <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = sub <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_3xi16( ; SI: %r = sub nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; 
SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = sub nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_3xi16( ; SI: %r = sub nuw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = sub nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_3xi16( ; SI: %r = sub nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = 
sub nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_3xi16( ; SI: %r = mul <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_3xi16( ; SI: %r = mul nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_3xi16( ; SI: %r = mul nuw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define 
<3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_3xi16( ; SI: %r = mul nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_3xi16( ; SI: %r = urem <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @urem_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @urem_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = urem <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_3xi16( ; SI: %r = srem <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to 
<3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @srem_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = srem <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_3xi16( ; SI: %r = shl <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_3xi16( ; SI: %r = shl nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_3xi16( ; SI: %r = shl nuw <3 x i16> %a, %b -; 
SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_3xi16( ; SI: %r = shl nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_3xi16( ; SI: %r = lshr <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 
x i16> %b) { %r = lshr <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_3xi16( ; SI: %r = lshr exact <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = lshr exact <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_3xi16( ; SI: %r = ashr <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = ashr <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_3xi16( ; SI: %r = ashr exact <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; 
VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = ashr exact <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @and_3xi16( ; SI: %r = and <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @and_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = and <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @or_3xi16( ; SI: %r = or <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @or_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = or <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_3xi16( ; SI: %r = xor <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; 
VI-NEXT: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = xor <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_3xi16( ; SI: %cmp = icmp eq <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1922,17 +2060,18 @@ define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp eq <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_3xi16( ; SI: %cmp = icmp ne <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ 
-1940,17 +2079,18 @@ define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp ne <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_3xi16( ; SI: %cmp = icmp ugt <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1958,17 +2098,18 @@ define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp ugt <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_3xi16( ; SI: %cmp = icmp uge <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x 
i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1976,17 +2117,18 @@ define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp uge <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_3xi16( ; SI: %cmp = icmp ult <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1994,17 +2136,18 @@ define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = 
icmp ult <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_3xi16( ; SI: %cmp = icmp ule <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2012,17 +2155,18 @@ define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp ule <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_3xi16( ; SI: %cmp = icmp sgt <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2030,17 +2174,18 @@ define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: 
%[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp sgt <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_3xi16( ; SI: %cmp = icmp sge <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2048,17 +2193,18 @@ define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp sge <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_3xi16( ; SI: %cmp = icmp slt <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt <3 x 
i32> %[[A_32_0]], %[[B_32_0]] @@ -2066,17 +2212,18 @@ define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp slt <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_3xi16( ; SI: %cmp = icmp sle <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2084,23 +2231,26 @@ define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp sle <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>) + ; GCN-LABEL: 
@bitreverse_3xi16( ; SI: %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a) -; SI-NEXT: ret <3 x i16> %brev +; SI-NEXT: store volatile <3 x i16> %brev ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @bitreverse_3xi16(<3 x i16> %a) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) { %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a) - ret <3 x i16> %brev + store volatile <3 x i16> %brev, <3 x i16> addrspace(1)* undef + ret void } diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll index 88ba310a92ca..a68ddabd9560 100644 --- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll +++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll @@ -1253,8 +1253,8 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00009171 Unknown note type: (0x0000000a) -; GFX800: AMD 0x00009190 Unknown note type: (0x0000000a) -; GFX900: AMD 0x00009171 Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008b06 Unknown note type: (0x0000000a) +; GFX800: AMD 0x00008e6a Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008b06 Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll index 40d115bfc060..207dfce75f16 100644 --- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -38,7 +38,7 
@@ define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 { ret void } -; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr +; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 23f40daf3d23..5705cbc99443 100644 --- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -44,12 +44,12 @@ entry: ; HSA-VI-NOXNACK: is_xnack_enabled = 0 ; HSA-VI-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 8 -; VI-NOXNACK: ; NumSgprs: 8 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 8 -; HSA-VI-NOXNACK: ; NumSgprs: 8 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() @@ -60,14 +60,49 @@ entry: ; HSA-NOXNACK: is_xnack_enabled = 0 ; HSA-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 10 -; VI-NOXNACK: ; NumSgprs: 10 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 10 -; HSA-VI-NOXNACK: ; NumSgprs: 10 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() ret void } + +; Make sure used SGPR count for flat_scr is correct when there is no +; scratch usage and implicit flat uses. 
+ +; GCN-LABEL: {{^}}use_flat_scr: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_lo: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_lo() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_LO}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_hi: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_hi() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_HI}"() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/frame-index-amdgiz.ll b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll new file mode 100644 index 000000000000..dd46403b68af --- /dev/null +++ b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll @@ -0,0 +1,55 @@ +; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; +; The original OpenCL kernel: +; kernel void f(global int *a, int i, int j) { +; int x[100]; +; x[i] = 7; +; a[0] = x[j]; +; } +; clang -cc1 -triple amdgcn---amdgizcl -emit-llvm -o - + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" +target triple = "amdgcn---amdgiz" + +define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 { +entry: +; CHECK: s_load_dword s2, s[0:1], 0xb +; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CHECK: s_load_dword s0, s[0:1], 0xc +; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; CHECK: s_mov_b32 s10, -1 +; CHECK: s_waitcnt lgkmcnt(0) +; CHECK: s_lshl_b32 s1, s2, 2 +; CHECK: v_mov_b32_e32 v0, 4 +; CHECK: s_mov_b32 s11, 0xe8f000 +; CHECK: v_add_i32_e32 v1, vcc, s1, v0 +; CHECK: v_mov_b32_e32 v2, 7 +; CHECK: 
s_lshl_b32 s0, s0, 2 +; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen +; CHECK: v_add_i32_e32 v0, vcc, s0, v0 +; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen +; CHECK: s_mov_b32 s7, 0xf000 +; CHECK: s_mov_b32 s6, -1 +; CHECK: s_waitcnt vmcnt(0) +; CHECK: buffer_store_dword v0, off, s[4:7], 0 +; CHECK: s_endpgm + + %x = alloca [100 x i32], align 4, addrspace(5) + %0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i + store i32 7, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j + %1 = load i32, i32 addrspace(5)* %arrayidx2, align 4 + store i32 %1, i32 addrspace(1)* %a, align 4 + call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + ret void +} + +declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #1 + +declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #1 + +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } diff --git a/test/CodeGen/AMDGPU/hsa-func-align.ll b/test/CodeGen/AMDGPU/hsa-func-align.ll new file mode 100644 index 000000000000..a00f5e2669d1 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-func-align.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=HSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj < %s | llvm-readobj -symbols -s -sd | FileCheck -check-prefix=ELF %s + +; ELF: Section { +; ELF: Name: .text +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_EXECINSTR (0x4) +; ELF: AddressAlignment: 32 +; ELF: } + +; HSA: .globl simple_align16 +; HSA: .p2align 5 +define void @simple_align16(i32 addrspace(1)* addrspace(2)* %ptr.out) align 32 { +entry: + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out + store i32 0, i32 addrspace(1)* 
%out + ret void +} diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index b4cdd4030d86..d96b796d4495 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -14,6 +14,7 @@ ; ELF: Flags [ (0x6) ; ELF: SHF_ALLOC (0x2) ; ELF: SHF_EXECINSTR (0x4) +; ELF: AddressAlignment: 4 ; ELF: } ; ELF: SHT_NOTE @@ -26,7 +27,7 @@ ; ELF: Symbol { ; ELF: Name: simple -; ELF: Size: 292 +; ELF: Size: 44 ; ELF: Type: Function (0x2) ; ELF: } @@ -36,12 +37,13 @@ ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" ; HSA-NOT: .amdgpu_hsa_kernel simple +; HSA: .globl simple +; HSA: .p2align 2 ; HSA: {{^}}simple: -; HSA: .amd_kernel_code_t -; HSA: enable_sgpr_private_segment_buffer = 1 -; HSA: enable_sgpr_kernarg_segment_ptr = 1 -; HSA: .end_amd_kernel_code_t -; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 +; HSA-NOT: amd_kernel_code_t + +; FIXME: Check this isn't a kernarg load when calling convention implemented. +; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 @@ -52,9 +54,20 @@ ; HSA: .Lfunc_end0: ; HSA: .size simple, .Lfunc_end0-simple - +; HSA: ; Function info: +; HSA-NOT: COMPUTE_PGM_RSRC2 define void @simple(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void } + +; Ignore explicit alignment that is too low. 
+; HSA: .globl simple_align2 +; HSA: .p2align 2 +define void @simple_align2(i32 addrspace(1)* addrspace(2)* %ptr.out) align 2 { +entry: + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll index b9df2cb779ad..84c42e8bd1e0 100644 --- a/test/CodeGen/AMDGPU/loop_break.ll +++ b/test/CodeGen/AMDGPU/loop_break.ll @@ -10,7 +10,7 @@ ; OPT: bb4: ; OPT: load volatile -; OPT: xor i1 %cmp1 +; OPT: %cmp1 = icmp sge i32 %tmp, %load ; OPT: call i64 @llvm.amdgcn.if.break( ; OPT: br label %Flow diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 9d0b6b395996..4bd8bff4809a 100644 --- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -9,18 +9,19 @@ ; StructurizeCFG. ; IR-LABEL: @multi_divergent_region_exit_ret_ret( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) -; IR: %2 = extractvalue { i1, i64 } %1, 0 -; IR: %3 = extractvalue { i1, i64 } %1, 1 -; IR: br i1 %2, label %LeafBlock1, label %Flow +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: %1 = extractvalue { i1, i64 } %0, 0 +; IR: %2 = extractvalue { i1, i64 } %0, 1 +; IR: br i1 %1, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: %7 = extractvalue { i1, i64 } %6, 0 -; IR: %8 = extractvalue { i1, i64 } %6, 1 -; IR: br i1 %7, label %LeafBlock, label %Flow1 +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %6 = extractvalue { i1, i64 } %5, 0 +; IR: %7 = extractvalue { i1, i64 } 
%5, 1 +; IR: br i1 %6, label %LeafBlock, label %Flow1 ; IR: LeafBlock: ; IR: br label %Flow1 @@ -29,32 +30,32 @@ ; IR: br label %Flow{{$}} ; IR: Flow2: -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: %13 = extractvalue { i1, i64 } %12, 0 -; IR: %14 = extractvalue { i1, i64 } %12, 1 -; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: [[IF:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: %10 = extractvalue { i1, i64 } [[IF]], 0 +; IR: %11 = extractvalue { i1, i64 } [[IF]], 1 +; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %8) -; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) -; IR: %18 = extractvalue { i1, i64 } %17, 0 -; IR: %19 = extractvalue { i1, i64 } %17, 1 -; IR: br i1 %18, label %exit1, label %Flow2 +; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ] +; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %7) +; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) +; IR: %15 = extractvalue { i1, i64 } %14, 0 +; IR: %16 = extractvalue { i1, i64 } %14, 1 +; IR: br i1 %15, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: call void @llvm.amdgcn.end.cf(i64 %11) ; IR: ret void @@ -64,11 +65,9 @@ ; GCN: s_xor_b64 -; FIXME: Why is this compare essentially repeated? 
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] -; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] +; GCN: ; %LeafBlock +; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG:v[0-9]+]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 ; GCN: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec @@ -126,14 +125,15 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock ; IR: UnifiedUnreachableBlock: @@ -181,51 +181,49 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( -; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 +; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2 ; IR: llvm.amdgcn.if ; IR: br i1 ; IR: {{^}}Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: br i1 %7, label %LeafBlock, label %Flow1 +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: br i1 %6, label %LeafBlock, label %Flow1 ; IR: {{^}}LeafBlock: -; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 -; IR: %9 = xor i1 
%divergent.cond1, true +; IR: %divergent.cond1 = icmp ne i32 %tmp16, 1 ; IR: br label %Flow1 ; IR: LeafBlock1: -; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 -; IR: %10 = xor i1 %uniform.cond0, true +; IR: %uniform.cond0 = icmp ne i32 %arg3, 2 ; IR: br label %Flow ; IR: Flow2: -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: {{^}}Flow1: -; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] -; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %8) -; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) -; IR: %18 = extractvalue { i1, i64 } %17, 0 -; IR: %19 = extractvalue { i1, i64 } %17, 1 -; IR: br i1 %18, label %exit1, label %Flow2 +; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ] +; IR: %13 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %7) +; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) +; IR: %15 = extractvalue { i1, i64 } %14, 0 +; IR: %16 = extractvalue { i1, i64 } %14, 1 +; IR: br i1 %15, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: call void @llvm.amdgcn.end.cf(i64 %11) ; IR: ret void define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -264,17 +262,18 @@ exit1: ; preds 
= %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) -; IR: br i1 %2, label %LeafBlock1, label %Flow +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: br i1 %1, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -314,13 +313,13 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( ; IR: Flow2: -; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] -; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %20) +; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %17) ; IR: UnifiedReturnBlock: -; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %15) +; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %12) ; IR: ret float %UnifiedRetVal define amdgpu_ps 
float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { entry: @@ -387,31 +386,32 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) ; IR: Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) ; IR: Flow2: -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef ; IR-NEXT: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %8) -; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) -; IR: %18 = extractvalue { i1, i64 } %17, 0 -; IR: %19 = extractvalue { i1, i64 } %17, 1 -; IR: br i1 %18, label %exit1, label %Flow2 +; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ] +; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %7) +; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) +; IR: %15 = extractvalue { i1, i64 } %14, 0 +; IR: %16 = extractvalue { i1, i64 } %14, 1 +; IR: br i1 %15, label %exit1, label %Flow2 ; IR: 
exit1: ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef @@ -419,7 +419,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) ; IR-NEXT: ret void define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -475,7 +475,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) ; IR-NEXT: ret void define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -622,15 +622,15 @@ uniform.ret: ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( ; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region -; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] -; IR: br i1 %8, label %uniform.if, label %Flow2 +; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %6, label %uniform.if, label %Flow2 ; IR: Flow: ; preds = %uniform.then, %uniform.if -; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] -; IR: br i1 %11, label %uniform.endif, label %uniform.ret0 +; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1, %uniform.if ] +; IR: br i1 %7, label %uniform.endif, label %uniform.ret0 ; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %5) ; IR-NEXT: ret void define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { entry: diff --git 
a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 672549c8ea63..c0b4eaff60aa 100644 --- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -133,9 +133,9 @@ bb23: ; preds = %bb10 ; IR: Flow1: ; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ] -; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ] +; IR-NEXT: %13 = phi <4 x i32> [ %28, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %14 = phi i32 [ %29, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %15 = phi i1 [ %30, %Flow6 ], [ false, %bb14 ] ; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ] ; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi) ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) @@ -144,9 +144,9 @@ bb23: ; preds = %bb10 ; IR: Flow2: ; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ] -; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ] +; IR-NEXT: %19 = phi <4 x i32> [ %28, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %20 = phi i32 [ %29, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %21 = phi i1 [ %30, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ] ; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23) @@ -156,16 +156,15 @@ bb23: ; preds = %bb10 ; IR: bb21: ; IR: %tmp12 = icmp slt i32 %tmp11, 9 -; IR-NEXT: %27 = xor i1 %tmp12, true -; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken) +; IR-NEXT: %27 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken) ; IR-NEXT: br label %Flow3 ; IR: Flow3: ; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 
] -; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ] -; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] -; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] -; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ] +; IR-NEXT: %loop.phi9 = phi i64 [ %27, %bb21 ], [ %loop.phi10, %Flow2 ] +; IR-NEXT: %28 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] +; IR-NEXT: %29 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] +; IR-NEXT: %30 = phi i1 [ %tmp12, %bb21 ], [ %21, %Flow2 ] ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26) ; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4 diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll index f2fbacbab82e..748f98a12c59 100644 --- a/test/CodeGen/AMDGPU/ret_jump.ll +++ b/test/CodeGen/AMDGPU/ret_jump.ll @@ -56,7 +56,7 @@ ret.bb: ; preds = %else, %main_body } ; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: -; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] ; GCN: ; BB#{{[0-9]+}}: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll index 8710fc8c7307..4b00a48211ec 100644 --- a/test/CodeGen/AMDGPU/select-vectors.ll +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -1,69 +1,186 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN 
-check-prefix=GFX9 %s ; Test expansion of scalar selects on vectors. ; Evergreen not enabled since it seems to be having problems with doubles. +; GCN-LABEL: {{^}}v_select_v2i8: +; SI: v_cndmask_b32 +; SI-NOT: cndmask -; FUNC-LABEL: {{^}}select_v4i8: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { +; GFX9: v_cndmask_b32 +; GFX9-NOT: cndmask + +; This is worse when i16 is legal and packed is not because +; SelectionDAGBuilder for some reason changes the select type. +; VI: v_cndmask_b32 +; VI: v_cndmask_b32 +define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2 + %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2 + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b + store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}v_select_v4i8: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr + %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b + store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v8i8: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr + %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 
%cmp, <8 x i8> %a, <8 x i8> %b + store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v16i8: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr + %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b + store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_v4i8: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 { %cmp = icmp eq i8 %c, 0 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: {{^}}select_v4i16: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 +; GCN-LABEL: {{^}}select_v2i16: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: v_cndmask_b32 +define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v2i16: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; 
GCN-LABEL: {{^}}v_select_v3i16: ; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { +; SI: cndmask +; SI-NOT: cndmask + +; GFX9: v_cndmask_b32_e32 +; GFX9: cndmask +; GFX9-NOT: cndmask + +; VI: v_cndmask_b32 +; VI: v_cndmask_b32 +; VI: v_cndmask_b32 +define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr + %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b + store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v4i16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 ret void } +; GCN-LABEL: {{^}}v_select_v8i16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr + %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b + store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4 + ret void +} + ; FIXME: Expansion with bitwise operations may be better if doing a ; vector select with SGPR inputs. 
-; FUNC-LABEL: {{^}}s_select_v2i32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}s_select_v2i32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: buffer_store_dwordx2 +define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 ret void } -; FUNC-LABEL: {{^}}s_select_v4i32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: buffer_store_dwordx4 -define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}s_select_v4i32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: buffer_store_dwordx4 +define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}v_select_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: buffer_store_dwordx4 +; GCN-LABEL: {{^}}v_select_v4i32: +; GCN: buffer_load_dwordx4 +; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, 
v{{[0-9]+}} +; GCN: buffer_store_dwordx4 define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 @@ -73,68 +190,68 @@ bb: ret void } -; FUNC-LABEL: {{^}}select_v8i32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v8i32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}s_select_v2f32: -; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} +; GCN-LABEL: {{^}}s_select_v2f32: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]] -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] -; SI-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]] +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] +; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; SI: 
v_cndmask_b32_e32 -; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] -; SI: v_cndmask_b32_e32 -; SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { +; GCN: v_cndmask_b32_e32 +; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] +; GCN: v_cndmask_b32_e32 +; GCN: buffer_store_dwordx2 +define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x float> %a, <2 x float> %b store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}s_select_v4f32: -; SI: s_load_dwordx4 -; SI: s_load_dwordx4 -; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN-LABEL: {{^}}s_select_v4f32: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx4 +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 -; SI: buffer_store_dwordx4 -define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { +; GCN: buffer_store_dwordx4 +define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x float> %a, <4 x float> %b store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}v_select_v4f32: -; SI: buffer_load_dwordx4 -; SI: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: buffer_store_dwordx4 +; GCN-LABEL: {{^}}v_select_v4f32: +; GCN: buffer_load_dwordx4 +; GCN: 
v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: buffer_store_dwordx4 define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 @@ -144,74 +261,112 @@ bb: ret void } -; FUNC-LABEL: {{^}}select_v8f32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v8f32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x float> %a, <8 x float> %b store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}select_v2f64: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v2f64: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b store <2 x double> %select, <2 x double> 
addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}select_v4f64: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v4f64: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}select_v8f64: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v8f64: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v8f64(<8 x double> 
addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 ret void } +; GCN-LABEL: {{^}}v_select_v2f16: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr + %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x half> %a, <2 x half> %b + store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v3f16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr + %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <3 x half> %a, <3 x half> %b + store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v4f16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr + %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x half> %a, <4 x half> %b + store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4 + ret void +} + ; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir 
b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 66d9033a6d7c..21c774133f89 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -12,6 +12,15 @@ define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } + define void @test_sub_s8() { ret void } + define void @test_sub_s16() { ret void } + define void @test_sub_s32() { ret void } + + define void @test_mul_s8() #1 { ret void } + define void @test_mul_s16() #1 { ret void } + define void @test_mul_s32() #1 { ret void } + define void @test_mulv5_s32() { ret void } + define void @test_load_from_stack() { ret void } define void @test_load_f32() #0 { ret void } define void @test_load_f64() #0 { ret void } @@ -24,6 +33,7 @@ define void @test_soft_fp_double() #0 { ret void } attributes #0 = { "target-features"="+vfp2,-neonfp" } + attributes #1 = { "target-features"="+v6" } ... --- name: test_zext_s1 @@ -297,6 +307,237 @@ body: | ; CHECK: BX_RET 14, _, implicit %d0 ... --- +name: test_sub_s8 +# CHECK-LABEL: name: test_sub_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr +# CHECK-DAG: id: 2, class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(s8) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s8) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s8) = G_SUB %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + + %r0 = COPY %2(s8) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_sub_s16 +# CHECK-LABEL: name: test_sub_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr +# CHECK-DAG: id: 2, class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(s16) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s16) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s16) = G_SUB %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + + %r0 = COPY %2(s16) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_sub_s32 +# CHECK-LABEL: name: test_sub_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: 0, class: gpr +# CHECK: id: 1, class: gpr +# CHECK: id: 2, class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s32) = G_SUB %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + + %r0 = COPY %2(s32) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_mul_s8 +# CHECK-LABEL: name: test_mul_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK-DAG: id: 0, class: gprnopc +# CHECK-DAG: id: 1, class: gprnopc +# CHECK-DAG: id: 2, class: gprnopc +body: | + bb.0: + liveins: %r0, %r1 + + %0(s8) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s8) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s8) = G_MUL %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + + %r0 = COPY %2(s8) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_mul_s16 +# CHECK-LABEL: name: test_mul_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK-DAG: id: 0, class: gprnopc +# CHECK-DAG: id: 1, class: gprnopc +# CHECK-DAG: id: 2, class: gprnopc +body: | + bb.0: + liveins: %r0, %r1 + + %0(s16) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s16) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s16) = G_MUL %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + + %r0 = COPY %2(s16) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_mul_s32 +# CHECK-LABEL: name: test_mul_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: 0, class: gprnopc +# CHECK: id: 1, class: gprnopc +# CHECK: id: 2, class: gprnopc +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s32) = G_MUL %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + + %r0 = COPY %2(s32) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_mulv5_s32 +# CHECK-LABEL: name: test_mulv5_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: 0, class: gprnopc +# CHECK: id: 1, class: gprnopc +# CHECK: id: 2, class: gprnopc +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s32) = G_MUL %0, %1 + ; CHECK: early-clobber [[VREGRES:%[0-9]+]] = MULv5 [[VREGX]], [[VREGY]], 14, _, _ + + %r0 = COPY %2(s32) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- name: test_load_from_stack # CHECK-LABEL: name: test_load_from_stack legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index a7f5ec33bee3..cf77ce352074 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -35,6 +35,19 @@ entry: ret i8 %sum } +define i8 @test_sub_i8(i8 %x, i8 %y) { +; CHECK-LABEL: name: test_sub_i8 +; CHECK: liveins: %r0, %r1 +; CHECK-DAG: [[VREGX:%[0-9]+]](s8) = COPY %r0 +; CHECK-DAG: [[VREGY:%[0-9]+]](s8) = COPY %r1 +; CHECK: [[RES:%[0-9]+]](s8) = G_SUB [[VREGX]], [[VREGY]] +; CHECK: %r0 = COPY [[RES]](s8) +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %res = sub i8 %x, %y + ret i8 %res +} + define signext i8 @test_return_sext_i8(i8 %x) { ; CHECK-LABEL: name: test_return_sext_i8 ; CHECK: liveins: %r0 @@ -59,6 +72,19 @@ entry: ret i16 %sum } +define i16 @test_sub_i16(i16 %x, i16 %y) { +; CHECK-LABEL: name: test_sub_i16 +; CHECK: liveins: %r0, %r1 +; CHECK-DAG: [[VREGX:%[0-9]+]](s16) = COPY %r0 +; CHECK-DAG: [[VREGY:%[0-9]+]](s16) = COPY %r1 +; CHECK: [[RES:%[0-9]+]](s16) = G_SUB [[VREGX]], [[VREGY]] +; CHECK: %r0 = COPY [[RES]](s16) +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %res = sub i16 %x, %y + ret i16 %res +} + define zeroext i16 @test_return_zext_i16(i16 %x) { ; CHECK-LABEL: name: test_return_zext_i16 ; CHECK: liveins: %r0 @@ -83,6 +109,19 @@ entry: ret i32 %sum } +define i32 @test_sub_i32(i32 %x, i32 %y) { +; CHECK-LABEL: name: test_sub_i32 +; CHECK: liveins: %r0, %r1 +; CHECK-DAG: [[VREGX:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[VREGY:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[RES:%[0-9]+]](s32) = G_SUB [[VREGX]], [[VREGY]] +; CHECK: %r0 = COPY [[RES]](s32) +; CHECK: BX_RET 14, _, implicit %r0 +entry: + %res = sub i32 %x, %y + ret i32 %res +} + define i32 @test_stack_args(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5) { ; CHECK-LABEL: name: test_stack_args ; CHECK: fixedStack: diff --git 
a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll index 236dcbeb84c5..f3ca2915f306 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple arm-unknown -mattr=+vfp2 -global-isel %s -o - | FileCheck %s +; RUN: llc -mtriple arm-unknown -mattr=+vfp2,+v6 -global-isel %s -o - | FileCheck %s define void @test_void_return() { ; CHECK-LABEL: test_void_return: @@ -67,6 +67,60 @@ entry: ret i32 %sum } +define i8 @test_sub_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_sub_i8: +; CHECK: sub r0, r0, r1 +; CHECK: bx lr +entry: + %sum = sub i8 %x, %y + ret i8 %sum +} + +define i16 @test_sub_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_sub_i16: +; CHECK: sub r0, r0, r1 +; CHECK: bx lr +entry: + %sum = sub i16 %x, %y + ret i16 %sum +} + +define i32 @test_sub_i32(i32 %x, i32 %y) { +; CHECK-LABEL: test_sub_i32: +; CHECK: sub r0, r0, r1 +; CHECK: bx lr +entry: + %sum = sub i32 %x, %y + ret i32 %sum +} + +define i8 @test_mul_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_mul_i8: +; CHECK: mul r0, r0, r1 +; CHECK: bx lr +entry: + %sum = mul i8 %x, %y + ret i8 %sum +} + +define i16 @test_mul_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_mul_i16: +; CHECK: mul r0, r0, r1 +; CHECK: bx lr +entry: + %sum = mul i16 %x, %y + ret i16 %sum +} + +define i32 @test_mul_i32(i32 %x, i32 %y) { +; CHECK-LABEL: test_mul_i32: +; CHECK: mul r0, r0, r1 +; CHECK: bx lr +entry: + %sum = mul i32 %x, %y + ret i32 %sum +} + define i32 @test_stack_args_i32(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5) { ; CHECK-LABEL: test_stack_args_i32: ; CHECK: add [[P5ADDR:r[0-9]+]], sp, #4 diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index cbff7e12fb77..625d35acf17b 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -7,6 +7,14 @@ define void @test_add_s16() { ret void } define void @test_add_s32() { 
ret void } + define void @test_sub_s8() { ret void } + define void @test_sub_s16() { ret void } + define void @test_sub_s32() { ret void } + + define void @test_mul_s8() { ret void } + define void @test_mul_s16() { ret void } + define void @test_mul_s32() { ret void } + define void @test_load_from_stack() { ret void } define void @test_legal_loads() #0 { ret void } define void @test_legal_stores() #0 { ret void } @@ -137,6 +145,154 @@ body: | %r0 = COPY %2(s32) BX_RET 14, _, implicit %r0 +... +--- +name: test_sub_s8 +# CHECK-LABEL: name: test_sub_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + %2(s8) = G_SUB %0, %1 + ; G_SUB with s8 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... +--- +name: test_sub_s16 +# CHECK-LABEL: name: test_sub_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + %2(s16) = G_SUB %0, %1 + ; G_SUB with s16 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 + +... 
+--- +name: test_sub_s32 +# CHECK-LABEL: name: test_sub_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_SUB %0, %1 + ; G_SUB with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_mul_s8 +# CHECK-LABEL: name: test_mul_s8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + %2(s8) = G_MUL %0, %1 + ; G_MUL with s8 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... +--- +name: test_mul_s16 +# CHECK-LABEL: name: test_mul_s16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + %2(s16) = G_MUL %0, %1 + ; G_MUL with s16 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 + +... 
+--- +name: test_mul_s32 +# CHECK-LABEL: name: test_mul_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_MUL %0, %1 + ; G_MUL with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + ... --- name: test_load_from_stack diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index fbf8d81322f8..e7935832f98a 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -5,6 +5,14 @@ define void @test_add_s8() { ret void } define void @test_add_s1() { ret void } + define void @test_sub_s32() { ret void } + define void @test_sub_s16() { ret void } + define void @test_sub_s8() { ret void } + + define void @test_mul_s32() { ret void } + define void @test_mul_s16() { ret void } + define void @test_mul_s8() { ret void } + define void @test_loads() #0 { ret void } define void @test_stores() #0 { ret void } @@ -124,6 +132,162 @@ body: | %r0 = COPY %2(s1) BX_RET 14, _, implicit %r0 +... +--- +name: test_sub_s32 +# CHECK-LABEL: name: test_sub_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_SUB %0, %1 + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... 
+--- +name: test_sub_s16 +# CHECK-LABEL: name: test_sub_s16 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + %2(s16) = G_SUB %0, %1 + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_sub_s8 +# CHECK-LABEL: name: test_sub_s8 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + %2(s8) = G_SUB %0, %1 + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_mul_s32 +# CHECK-LABEL: name: test_mul_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_MUL %0, %1 + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_mul_s16 +# CHECK-LABEL: name: test_mul_s16 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + %2(s16) = G_MUL %0, %1 + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 + +... 
+--- +name: test_mul_s8 +# CHECK-LABEL: name: test_mul_s8 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + %2(s8) = G_MUL %0, %1 + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 + ... --- name: test_loads diff --git a/test/CodeGen/ARM/alloc-no-stack-realign.ll b/test/CodeGen/ARM/alloc-no-stack-realign.ll index 0e077b3aee5a..64c279b0f218 100644 --- a/test/CodeGen/ARM/alloc-no-stack-realign.ll +++ b/test/CodeGen/ARM/alloc-no-stack-realign.ll @@ -7,31 +7,32 @@ define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" { entry: -; CHECK-LABEL: test1 -; CHECK: ldr r[[R1:[0-9]+]], [pc, r1] -; CHECK: add r[[R2:[0-9]+]], r1, #48 -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: mov r[[R2:[0-9]+]], r[[R1]] -; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32 -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: mov r[[R1:[0-9]+]], sp -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32 -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]! -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! 
-; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #48 -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #32 -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]! -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128] +; CHECK-LABEL: test1: +; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]] +; CHECK: mov r[[R2:[0-9]+]], r[[R1]] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]! +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48 +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32 +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: mov r[[R1:[0-9]+]], #32 +; CHECK: mov r[[R2:[0-9]+]], sp +; CHECK: mov r[[R3:[0-9]+]], r[[R2]] +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128], r[[R1]] +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #48 +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #32 +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! 
+; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128] %retval = alloca <16 x float>, align 16 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 store <16 x float> %0, <16 x float>* %retval @@ -42,30 +43,32 @@ entry: define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp { entry: -; CHECK: ldr r[[R1:[0-9]+]], [pc, r1] -; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48 -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: mov r[[R2:[0-9]+]], r[[R1]] -; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32 -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: mov r[[R1:[0-9]+]], sp -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #32 -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]! -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]! -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #48 -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: add r[[R1:[0-9]+]], r0, #32 -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128] -; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]! -; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128] +; CHECK-LABEL: test2: +; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]] +; CHECK: mov r[[R2:[0-9]+]], r[[R1]] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]! 
+; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48 +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32 +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: mov r[[R1:[0-9]+]], #32 +; CHECK: mov r[[R2:[0-9]+]], sp +; CHECK: mov r[[R3:[0-9]+]], r[[R2]] +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128], r[[R1]] +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128] +; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #48 +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: add r[[R1:[0-9]+]], r0, #32 +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128] +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]! +; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128] %retval = alloca <16 x float>, align 16 diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index fc85a3a2e683..699ef6e92a4f 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ b/test/CodeGen/ARM/build-attributes.ll @@ -231,6 +231,11 @@ ; V6: .eabi_attribute 6, 6 ; V6: .eabi_attribute 8, 1 ;; We assume round-to-nearest by default (matches GCC) +; V6-NOT: .eabi_attribute 27 +; V6-NOT: .eabi_attribute 36 +; V6-NOT: .eabi_attribute 42 +; V6-NOT: .eabi_attribute 44 +; V6-NOT: .eabi_attribute 68 ; V6-NOT: .eabi_attribute 19 ;; The default choice made by llc is for a V6 CPU without an FPU. 
;; This is not an interesting detail, but for such CPUs, the default intention is to use @@ -242,13 +247,8 @@ ; V6: .eabi_attribute 23, 3 ; V6: .eabi_attribute 24, 1 ; V6: .eabi_attribute 25, 1 -; V6-NOT: .eabi_attribute 27 ; V6-NOT: .eabi_attribute 28 -; V6-NOT: .eabi_attribute 36 ; V6: .eabi_attribute 38, 1 -; V6-NOT: .eabi_attribute 42 -; V6-NOT: .eabi_attribute 44 -; V6-NOT: .eabi_attribute 68 ; V6-FAST-NOT: .eabi_attribute 19 ;; Despite the V6 CPU having no FPU by default, we chose to flush to @@ -262,9 +262,14 @@ ;; We emit 6, 12 for both v6-M and v6S-M, technically this is incorrect for ;; V6-M, however we don't model the OS extension so this is fine. ; V6M: .eabi_attribute 6, 12 -; V6M-NOT: .eabi_attribute 7 +; V6M: .eabi_attribute 7, 77 ; V6M: .eabi_attribute 8, 0 ; V6M: .eabi_attribute 9, 1 +; V6M-NOT: .eabi_attribute 27 +; V6M-NOT: .eabi_attribute 36 +; V6M-NOT: .eabi_attribute 42 +; V6M-NOT: .eabi_attribute 44 +; V6M-NOT: .eabi_attribute 68 ; V6M-NOT: .eabi_attribute 19 ;; The default choice made by llc is for a V6M CPU without an FPU. 
;; This is not an interesting detail, but for such CPUs, the default intention is to use @@ -276,13 +281,8 @@ ; V6M: .eabi_attribute 23, 3 ; V6M: .eabi_attribute 24, 1 ; V6M: .eabi_attribute 25, 1 -; V6M-NOT: .eabi_attribute 27 ; V6M-NOT: .eabi_attribute 28 -; V6M-NOT: .eabi_attribute 36 ; V6M: .eabi_attribute 38, 1 -; V6M-NOT: .eabi_attribute 42 -; V6M-NOT: .eabi_attribute 44 -; V6M-NOT: .eabi_attribute 68 ; V6M-FAST-NOT: .eabi_attribute 19 ;; Despite the V6M CPU having no FPU by default, we chose to flush to @@ -298,6 +298,11 @@ ; ARM1156T2F-S: .eabi_attribute 8, 1 ; ARM1156T2F-S: .eabi_attribute 9, 2 ; ARM1156T2F-S: .fpu vfpv2 +; ARM1156T2F-S-NOT: .eabi_attribute 27 +; ARM1156T2F-S-NOT: .eabi_attribute 36 +; ARM1156T2F-S-NOT: .eabi_attribute 42 +; ARM1156T2F-S-NOT: .eabi_attribute 44 +; ARM1156T2F-S-NOT: .eabi_attribute 68 ; ARM1156T2F-S-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; ARM1156T2F-S: .eabi_attribute 20, 1 @@ -306,13 +311,8 @@ ; ARM1156T2F-S: .eabi_attribute 23, 3 ; ARM1156T2F-S: .eabi_attribute 24, 1 ; ARM1156T2F-S: .eabi_attribute 25, 1 -; ARM1156T2F-S-NOT: .eabi_attribute 27 ; ARM1156T2F-S-NOT: .eabi_attribute 28 -; ARM1156T2F-S-NOT: .eabi_attribute 36 ; ARM1156T2F-S: .eabi_attribute 38, 1 -; ARM1156T2F-S-NOT: .eabi_attribute 42 -; ARM1156T2F-S-NOT: .eabi_attribute 44 -; ARM1156T2F-S-NOT: .eabi_attribute 68 ; ARM1156T2F-S-FAST-NOT: .eabi_attribute 19 ;; V6 cores default to flush to positive zero (value 0). Note that value 2 is also equally @@ -327,6 +327,11 @@ ; V7M: .eabi_attribute 7, 77 ; V7M: .eabi_attribute 8, 0 ; V7M: .eabi_attribute 9, 2 +; V7M-NOT: .eabi_attribute 27 +; V7M-NOT: .eabi_attribute 36 +; V7M-NOT: .eabi_attribute 42 +; V7M-NOT: .eabi_attribute 44 +; V7M-NOT: .eabi_attribute 68 ; V7M-NOT: .eabi_attribute 19 ;; The default choice made by llc is for a V7M CPU without an FPU. 
;; This is not an interesting detail, but for such CPUs, the default intention is to use @@ -338,13 +343,8 @@ ; V7M: .eabi_attribute 23, 3 ; V7M: .eabi_attribute 24, 1 ; V7M: .eabi_attribute 25, 1 -; V7M-NOT: .eabi_attribute 27 ; V7M-NOT: .eabi_attribute 28 -; V7M-NOT: .eabi_attribute 36 ; V7M: .eabi_attribute 38, 1 -; V7M-NOT: .eabi_attribute 42 -; V7M-NOT: .eabi_attribute 44 -; V7M-NOT: .eabi_attribute 68 ; V7M-FAST-NOT: .eabi_attribute 19 ;; Despite the V7M CPU having no FPU by default, we chose to flush @@ -357,6 +357,11 @@ ; V7: .syntax unified ; V7: .eabi_attribute 6, 10 +; V7-NOT: .eabi_attribute 27 +; V7-NOT: .eabi_attribute 36 +; V7-NOT: .eabi_attribute 42 +; V7-NOT: .eabi_attribute 44 +; V7-NOT: .eabi_attribute 68 ; V7-NOT: .eabi_attribute 19 ;; In safe-maths mode we default to an IEEE 754 compliant choice. ; V7: .eabi_attribute 20, 1 @@ -365,13 +370,8 @@ ; V7: .eabi_attribute 23, 3 ; V7: .eabi_attribute 24, 1 ; V7: .eabi_attribute 25, 1 -; V7-NOT: .eabi_attribute 27 ; V7-NOT: .eabi_attribute 28 -; V7-NOT: .eabi_attribute 36 ; V7: .eabi_attribute 38, 1 -; V7-NOT: .eabi_attribute 42 -; V7-NOT: .eabi_attribute 44 -; V7-NOT: .eabi_attribute 68 ; V7-FAST-NOT: .eabi_attribute 19 ;; The default CPU does have an FPU and it must be VFPv3 or better, so it flushes @@ -386,6 +386,9 @@ ; V7VE: .eabi_attribute 7, 65 @ Tag_CPU_arch_profile ; V7VE: .eabi_attribute 8, 1 @ Tag_ARM_ISA_use ; V7VE: .eabi_attribute 9, 2 @ Tag_THUMB_ISA_use +; V7VE: .eabi_attribute 42, 1 @ Tag_MPextension_use +; V7VE: .eabi_attribute 44, 2 @ Tag_DIV_use +; V7VE: .eabi_attribute 68, 3 @ Tag_Virtualization_use ; V7VE: .eabi_attribute 17, 1 @ Tag_ABI_PCS_GOT_use ; V7VE: .eabi_attribute 20, 1 @ Tag_ABI_FP_denormal ; V7VE: .eabi_attribute 21, 1 @ Tag_ABI_FP_exceptions @@ -393,19 +396,16 @@ ; V7VE: .eabi_attribute 24, 1 @ Tag_ABI_align_needed ; V7VE: .eabi_attribute 25, 1 @ Tag_ABI_align_preserved ; V7VE: .eabi_attribute 38, 1 @ Tag_ABI_FP_16bit_format -; V7VE: .eabi_attribute 42, 1 @ 
Tag_MPextension_use -; V7VE: .eabi_attribute 44, 2 @ Tag_DIV_use -; V7VE: .eabi_attribute 68, 3 @ Tag_Virtualization_use ; V8: .syntax unified ; V8: .eabi_attribute 67, "2.09" ; V8: .eabi_attribute 6, 14 +; V8-NOT: .eabi_attribute 44 ; V8-NOT: .eabi_attribute 19 ; V8: .eabi_attribute 20, 1 ; V8: .eabi_attribute 21, 1 ; V8-NOT: .eabi_attribute 22 ; V8: .eabi_attribute 23, 3 -; V8-NOT: .eabi_attribute 44 ; V8-FAST-NOT: .eabi_attribute 19 ;; The default does have an FPU, and for V8-A, it flushes preserving sign. @@ -496,6 +496,30 @@ ; CORTEX-A7-FPUV4: .fpu vfpv4 ; CORTEX-A7-CHECK-NOT: .eabi_attribute 19 + +; Tag_FP_HP_extension +; CORTEX-A7-CHECK: .eabi_attribute 36, 1 +; CORTEX-A7-NOFPU-NOT: .eabi_attribute 36 +; CORTEX-A7-FPUV4: .eabi_attribute 36, 1 + +; Tag_MPextension_use +; CORTEX-A7-CHECK: .eabi_attribute 42, 1 +; CORTEX-A7-NOFPU: .eabi_attribute 42, 1 +; CORTEX-A7-FPUV4: .eabi_attribute 42, 1 + +; Tag_DIV_use +; CORTEX-A7-CHECK: .eabi_attribute 44, 2 +; CORTEX-A7-NOFPU: .eabi_attribute 44, 2 +; CORTEX-A7-FPUV4: .eabi_attribute 44, 2 + +; Tag_DSP_extension +; CORTEX-A7-CHECK-NOT: .eabi_attribute 46 + +; Tag_Virtualization_use +; CORTEX-A7-CHECK: .eabi_attribute 68, 3 +; CORTEX-A7-NOFPU: .eabi_attribute 68, 3 +; CORTEX-A7-FPUV4: .eabi_attribute 68, 3 + ; Tag_ABI_FP_denormal ;; We default to IEEE 754 compliance ; CORTEX-A7-CHECK: .eabi_attribute 20, 1 @@ -535,40 +559,20 @@ ; CORTEX-A7-NOFPU: .eabi_attribute 25, 1 ; CORTEX-A7-FPUV4: .eabi_attribute 25, 1 -; Tag_FP_HP_extension -; CORTEX-A7-CHECK: .eabi_attribute 36, 1 -; CORTEX-A7-NOFPU-NOT: .eabi_attribute 36 -; CORTEX-A7-FPUV4: .eabi_attribute 36, 1 - ; Tag_FP_16bit_format ; CORTEX-A7-CHECK: .eabi_attribute 38, 1 ; CORTEX-A7-NOFPU: .eabi_attribute 38, 1 ; CORTEX-A7-FPUV4: .eabi_attribute 38, 1 -; Tag_MPextension_use -; CORTEX-A7-CHECK: .eabi_attribute 42, 1 -; CORTEX-A7-NOFPU: .eabi_attribute 42, 1 -; CORTEX-A7-FPUV4: .eabi_attribute 42, 1 - -; Tag_DIV_use -; CORTEX-A7-CHECK: .eabi_attribute 44, 2 -; 
CORTEX-A7-NOFPU: .eabi_attribute 44, 2 -; CORTEX-A7-FPUV4: .eabi_attribute 44, 2 - -; Tag_DSP_extension -; CORTEX-A7-CHECK-NOT: .eabi_attribute 46 - -; Tag_Virtualization_use -; CORTEX-A7-CHECK: .eabi_attribute 68, 3 -; CORTEX-A7-NOFPU: .eabi_attribute 68, 3 -; CORTEX-A7-FPUV4: .eabi_attribute 68, 3 - ; CORTEX-A5-DEFAULT: .cpu cortex-a5 ; CORTEX-A5-DEFAULT: .eabi_attribute 6, 10 ; CORTEX-A5-DEFAULT: .eabi_attribute 7, 65 ; CORTEX-A5-DEFAULT: .eabi_attribute 8, 1 ; CORTEX-A5-DEFAULT: .eabi_attribute 9, 2 ; CORTEX-A5-DEFAULT: .fpu neon-vfpv4 +; CORTEX-A5-DEFAULT: .eabi_attribute 42, 1 +; CORTEX-A5-DEFAULT-NOT: .eabi_attribute 44 +; CORTEX-A5-DEFAULT: .eabi_attribute 68, 1 ; CORTEX-A5-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A5-DEFAULT: .eabi_attribute 20, 1 @@ -577,9 +581,6 @@ ; CORTEX-A5-DEFAULT: .eabi_attribute 23, 3 ; CORTEX-A5-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A5-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A5-DEFAULT: .eabi_attribute 42, 1 -; CORTEX-A5-DEFAULT-NOT: .eabi_attribute 44 -; CORTEX-A5-DEFAULT: .eabi_attribute 68, 1 ; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 19 ;; The A5 defaults to a VFPv4 FPU, so it flushed preserving the sign when -ffast-math @@ -595,6 +596,8 @@ ; CORTEX-A5-NONEON: .eabi_attribute 8, 1 ; CORTEX-A5-NONEON: .eabi_attribute 9, 2 ; CORTEX-A5-NONEON: .fpu vfpv4-d16 +; CORTEX-A5-NONEON: .eabi_attribute 42, 1 +; CORTEX-A5-NONEON: .eabi_attribute 68, 1 ;; We default to IEEE 754 compliance ; CORTEX-A5-NONEON: .eabi_attribute 20, 1 ; CORTEX-A5-NONEON: .eabi_attribute 21, 1 @@ -602,8 +605,6 @@ ; CORTEX-A5-NONEON: .eabi_attribute 23, 3 ; CORTEX-A5-NONEON: .eabi_attribute 24, 1 ; CORTEX-A5-NONEON: .eabi_attribute 25, 1 -; CORTEX-A5-NONEON: .eabi_attribute 42, 1 -; CORTEX-A5-NONEON: .eabi_attribute 68, 1 ; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 19 ;; The A5 defaults to a VFPv4 FPU, so it flushed preserving sign when -ffast-math @@ -619,6 +620,8 @@ ; CORTEX-A5-NOFPU: .eabi_attribute 8, 1 ; CORTEX-A5-NOFPU: 
.eabi_attribute 9, 2 ; CORTEX-A5-NOFPU-NOT: .fpu +; CORTEX-A5-NOFPU: .eabi_attribute 42, 1 +; CORTEX-A5-NOFPU: .eabi_attribute 68, 1 ; CORTEX-A5-NOFPU-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A5-NOFPU: .eabi_attribute 20, 1 @@ -627,8 +630,6 @@ ; CORTEX-A5-NOFPU: .eabi_attribute 23, 3 ; CORTEX-A5-NOFPU: .eabi_attribute 24, 1 ; CORTEX-A5-NOFPU: .eabi_attribute 25, 1 -; CORTEX-A5-NOFPU: .eabi_attribute 42, 1 -; CORTEX-A5-NOFPU: .eabi_attribute 68, 1 ; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 19 ;; Despite there being no FPU, we chose to flush to zero preserving @@ -645,6 +646,11 @@ ; CORTEX-A8-SOFT: .eabi_attribute 8, 1 ; CORTEX-A8-SOFT: .eabi_attribute 9, 2 ; CORTEX-A8-SOFT: .fpu neon +; CORTEX-A8-SOFT-NOT: .eabi_attribute 27 +; CORTEX-A8-SOFT-NOT: .eabi_attribute 36, 1 +; CORTEX-A8-SOFT-NOT: .eabi_attribute 42, 1 +; CORTEX-A8-SOFT-NOT: .eabi_attribute 44 +; CORTEX-A8-SOFT: .eabi_attribute 68, 1 ; CORTEX-A8-SOFT-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A8-SOFT: .eabi_attribute 20, 1 @@ -653,13 +659,8 @@ ; CORTEX-A8-SOFT: .eabi_attribute 23, 3 ; CORTEX-A8-SOFT: .eabi_attribute 24, 1 ; CORTEX-A8-SOFT: .eabi_attribute 25, 1 -; CORTEX-A8-SOFT-NOT: .eabi_attribute 27 ; CORTEX-A8-SOFT-NOT: .eabi_attribute 28 -; CORTEX-A8-SOFT-NOT: .eabi_attribute 36, 1 ; CORTEX-A8-SOFT: .eabi_attribute 38, 1 -; CORTEX-A8-SOFT-NOT: .eabi_attribute 42, 1 -; CORTEX-A8-SOFT-NOT: .eabi_attribute 44 -; CORTEX-A8-SOFT: .eabi_attribute 68, 1 ; CORTEX-A9-SOFT: .cpu cortex-a9 ; CORTEX-A9-SOFT: .eabi_attribute 6, 10 @@ -667,6 +668,11 @@ ; CORTEX-A9-SOFT: .eabi_attribute 8, 1 ; CORTEX-A9-SOFT: .eabi_attribute 9, 2 ; CORTEX-A9-SOFT: .fpu neon +; CORTEX-A9-SOFT-NOT: .eabi_attribute 27 +; CORTEX-A9-SOFT: .eabi_attribute 36, 1 +; CORTEX-A9-SOFT: .eabi_attribute 42, 1 +; CORTEX-A9-SOFT-NOT: .eabi_attribute 44 +; CORTEX-A9-SOFT: .eabi_attribute 68, 1 ; CORTEX-A9-SOFT-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A9-SOFT: 
.eabi_attribute 20, 1 @@ -675,13 +681,8 @@ ; CORTEX-A9-SOFT: .eabi_attribute 23, 3 ; CORTEX-A9-SOFT: .eabi_attribute 24, 1 ; CORTEX-A9-SOFT: .eabi_attribute 25, 1 -; CORTEX-A9-SOFT-NOT: .eabi_attribute 27 ; CORTEX-A9-SOFT-NOT: .eabi_attribute 28 -; CORTEX-A9-SOFT: .eabi_attribute 36, 1 ; CORTEX-A9-SOFT: .eabi_attribute 38, 1 -; CORTEX-A9-SOFT: .eabi_attribute 42, 1 -; CORTEX-A9-SOFT-NOT: .eabi_attribute 44 -; CORTEX-A9-SOFT: .eabi_attribute 68, 1 ; CORTEX-A8-SOFT-FAST-NOT: .eabi_attribute 19 ; CORTEX-A9-SOFT-FAST-NOT: .eabi_attribute 19 @@ -699,6 +700,10 @@ ; CORTEX-A8-HARD: .eabi_attribute 8, 1 ; CORTEX-A8-HARD: .eabi_attribute 9, 2 ; CORTEX-A8-HARD: .fpu neon +; CORTEX-A8-HARD-NOT: .eabi_attribute 27 +; CORTEX-A8-HARD-NOT: .eabi_attribute 36, 1 +; CORTEX-A8-HARD-NOT: .eabi_attribute 42, 1 +; CORTEX-A8-HARD: .eabi_attribute 68, 1 ; CORTEX-A8-HARD-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A8-HARD: .eabi_attribute 20, 1 @@ -707,12 +712,8 @@ ; CORTEX-A8-HARD: .eabi_attribute 23, 3 ; CORTEX-A8-HARD: .eabi_attribute 24, 1 ; CORTEX-A8-HARD: .eabi_attribute 25, 1 -; CORTEX-A8-HARD-NOT: .eabi_attribute 27 ; CORTEX-A8-HARD: .eabi_attribute 28, 1 -; CORTEX-A8-HARD-NOT: .eabi_attribute 36, 1 ; CORTEX-A8-HARD: .eabi_attribute 38, 1 -; CORTEX-A8-HARD-NOT: .eabi_attribute 42, 1 -; CORTEX-A8-HARD: .eabi_attribute 68, 1 @@ -722,6 +723,10 @@ ; CORTEX-A9-HARD: .eabi_attribute 8, 1 ; CORTEX-A9-HARD: .eabi_attribute 9, 2 ; CORTEX-A9-HARD: .fpu neon +; CORTEX-A9-HARD-NOT: .eabi_attribute 27 +; CORTEX-A9-HARD: .eabi_attribute 36, 1 +; CORTEX-A9-HARD: .eabi_attribute 42, 1 +; CORTEX-A9-HARD: .eabi_attribute 68, 1 ; CORTEX-A9-HARD-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A9-HARD: .eabi_attribute 20, 1 @@ -730,12 +735,8 @@ ; CORTEX-A9-HARD: .eabi_attribute 23, 3 ; CORTEX-A9-HARD: .eabi_attribute 24, 1 ; CORTEX-A9-HARD: .eabi_attribute 25, 1 -; CORTEX-A9-HARD-NOT: .eabi_attribute 27 ; CORTEX-A9-HARD: .eabi_attribute 28, 1 -; 
CORTEX-A9-HARD: .eabi_attribute 36, 1 ; CORTEX-A9-HARD: .eabi_attribute 38, 1 -; CORTEX-A9-HARD: .eabi_attribute 42, 1 -; CORTEX-A9-HARD: .eabi_attribute 68, 1 ; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 19 ;; The A8 defaults to a VFPv3 FPU, so it flushes preserving the sign when @@ -759,6 +760,9 @@ ; CORTEX-A12-DEFAULT: .eabi_attribute 8, 1 ; CORTEX-A12-DEFAULT: .eabi_attribute 9, 2 ; CORTEX-A12-DEFAULT: .fpu neon-vfpv4 +; CORTEX-A12-DEFAULT: .eabi_attribute 42, 1 +; CORTEX-A12-DEFAULT: .eabi_attribute 44, 2 +; CORTEX-A12-DEFAULT: .eabi_attribute 68, 3 ; CORTEX-A12-DEFAULT-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A12-DEFAULT: .eabi_attribute 20, 1 @@ -767,9 +771,6 @@ ; CORTEX-A12-DEFAULT: .eabi_attribute 23, 3 ; CORTEX-A12-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A12-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A12-DEFAULT: .eabi_attribute 42, 1 -; CORTEX-A12-DEFAULT: .eabi_attribute 44, 2 -; CORTEX-A12-DEFAULT: .eabi_attribute 68, 3 ; CORTEX-A12-DEFAULT-FAST-NOT: .eabi_attribute 19 ;; The A12 defaults to a VFPv3 FPU, so it flushes preserving the sign when @@ -785,6 +786,9 @@ ; CORTEX-A12-NOFPU: .eabi_attribute 8, 1 ; CORTEX-A12-NOFPU: .eabi_attribute 9, 2 ; CORTEX-A12-NOFPU-NOT: .fpu +; CORTEX-A12-NOFPU: .eabi_attribute 42, 1 +; CORTEX-A12-NOFPU: .eabi_attribute 44, 2 +; CORTEX-A12-NOFPU: .eabi_attribute 68, 3 ; CORTEX-A12-NOFPU-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A12-NOFPU: .eabi_attribute 20, 1 @@ -793,9 +797,6 @@ ; CORTEX-A12-NOFPU: .eabi_attribute 23, 3 ; CORTEX-A12-NOFPU: .eabi_attribute 24, 1 ; CORTEX-A12-NOFPU: .eabi_attribute 25, 1 -; CORTEX-A12-NOFPU: .eabi_attribute 42, 1 -; CORTEX-A12-NOFPU: .eabi_attribute 44, 2 -; CORTEX-A12-NOFPU: .eabi_attribute 68, 3 ; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 19 ;; Despite there being no FPU, we chose to flush to zero preserving @@ -812,6 +813,11 @@ ; CORTEX-A15: .eabi_attribute 8, 1 ; CORTEX-A15: .eabi_attribute 9, 2 ; CORTEX-A15: .fpu neon-vfpv4 +; 
CORTEX-A15-NOT: .eabi_attribute 27 +; CORTEX-A15: .eabi_attribute 36, 1 +; CORTEX-A15: .eabi_attribute 42, 1 +; CORTEX-A15: .eabi_attribute 44, 2 +; CORTEX-A15: .eabi_attribute 68, 3 ; CORTEX-A15-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A15: .eabi_attribute 20, 1 @@ -820,13 +826,8 @@ ; CORTEX-A15: .eabi_attribute 23, 3 ; CORTEX-A15: .eabi_attribute 24, 1 ; CORTEX-A15: .eabi_attribute 25, 1 -; CORTEX-A15-NOT: .eabi_attribute 27 ; CORTEX-A15-NOT: .eabi_attribute 28 -; CORTEX-A15: .eabi_attribute 36, 1 ; CORTEX-A15: .eabi_attribute 38, 1 -; CORTEX-A15: .eabi_attribute 42, 1 -; CORTEX-A15: .eabi_attribute 44, 2 -; CORTEX-A15: .eabi_attribute 68, 3 ; CORTEX-A15-FAST-NOT: .eabi_attribute 19 ;; The A15 defaults to a VFPv3 FPU, so it flushes preserving the sign when @@ -842,6 +843,9 @@ ; CORTEX-A17-DEFAULT: .eabi_attribute 8, 1 ; CORTEX-A17-DEFAULT: .eabi_attribute 9, 2 ; CORTEX-A17-DEFAULT: .fpu neon-vfpv4 +; CORTEX-A17-DEFAULT: .eabi_attribute 42, 1 +; CORTEX-A17-DEFAULT: .eabi_attribute 44, 2 +; CORTEX-A17-DEFAULT: .eabi_attribute 68, 3 ; CORTEX-A17-DEFAULT-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A17-DEFAULT: .eabi_attribute 20, 1 @@ -850,9 +854,6 @@ ; CORTEX-A17-DEFAULT: .eabi_attribute 23, 3 ; CORTEX-A17-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A17-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A17-DEFAULT: .eabi_attribute 42, 1 -; CORTEX-A17-DEFAULT: .eabi_attribute 44, 2 -; CORTEX-A17-DEFAULT: .eabi_attribute 68, 3 ; CORTEX-A17-FAST-NOT: .eabi_attribute 19 ;; The A17 defaults to a VFPv3 FPU, so it flushes preserving the sign when @@ -868,6 +869,9 @@ ; CORTEX-A17-NOFPU: .eabi_attribute 8, 1 ; CORTEX-A17-NOFPU: .eabi_attribute 9, 2 ; CORTEX-A17-NOFPU-NOT: .fpu +; CORTEX-A17-NOFPU: .eabi_attribute 42, 1 +; CORTEX-A17-NOFPU: .eabi_attribute 44, 2 +; CORTEX-A17-NOFPU: .eabi_attribute 68, 3 ; CORTEX-A17-NOFPU-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A17-NOFPU: .eabi_attribute 20, 1 @@ -876,9 
+880,6 @@ ; CORTEX-A17-NOFPU: .eabi_attribute 23, 3 ; CORTEX-A17-NOFPU: .eabi_attribute 24, 1 ; CORTEX-A17-NOFPU: .eabi_attribute 25, 1 -; CORTEX-A17-NOFPU: .eabi_attribute 42, 1 -; CORTEX-A17-NOFPU: .eabi_attribute 44, 2 -; CORTEX-A17-NOFPU: .eabi_attribute 68, 3 ; CORTEX-A17-NOFPU-NOT: .eabi_attribute 19 ;; Despite there being no FPU, we chose to flush to zero preserving @@ -897,25 +898,25 @@ ; CORTEX-M0: .cpu cortex-m0 ; CORTEX-M0: .eabi_attribute 6, 12 -; CORTEX-M0-NOT: .eabi_attribute 7 +; CORTEX-M0: .eabi_attribute 7, 77 ; CORTEX-M0: .eabi_attribute 8, 0 ; CORTEX-M0: .eabi_attribute 9, 1 +; CORTEX-M0-NOT: .eabi_attribute 27 +; CORTEX-M0-NOT: .eabi_attribute 36 +; CORTEX-M0: .eabi_attribute 34, 0 +; CORTEX-M0-NOT: .eabi_attribute 42 +; CORTEX-M0-NOT: .eabi_attribute 44 +; CORTEX-M0-NOT: .eabi_attribute 68 ; CORTEX-M0-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-M0: .eabi_attribute 20, 1 ; CORTEX-M0: .eabi_attribute 21, 1 ; CORTEX-M0-NOT: .eabi_attribute 22 ; CORTEX-M0: .eabi_attribute 23, 3 -; CORTEX-M0: .eabi_attribute 34, 0 ; CORTEX-M0: .eabi_attribute 24, 1 ; CORTEX-M0: .eabi_attribute 25, 1 -; CORTEX-M0-NOT: .eabi_attribute 27 ; CORTEX-M0-NOT: .eabi_attribute 28 -; CORTEX-M0-NOT: .eabi_attribute 36 ; CORTEX-M0: .eabi_attribute 38, 1 -; CORTEX-M0-NOT: .eabi_attribute 42 -; CORTEX-M0-NOT: .eabi_attribute 44 -; CORTEX-M0-NOT: .eabi_attribute 68 ; CORTEX-M0-FAST-NOT: .eabi_attribute 19 ;; Despite the M0 CPU having no FPU in this scenario, we chose to @@ -930,9 +931,14 @@ ; CORTEX-M0PLUS: .cpu cortex-m0plus ; CORTEX-M0PLUS: .eabi_attribute 6, 12 -; CORTEX-M0PLUS-NOT: .eabi_attribute 7 +; CORTEX-M0PLUS: .eabi_attribute 7, 77 ; CORTEX-M0PLUS: .eabi_attribute 8, 0 ; CORTEX-M0PLUS: .eabi_attribute 9, 1 +; CORTEX-M0PLUS-NOT: .eabi_attribute 27 +; CORTEX-M0PLUS-NOT: .eabi_attribute 36 +; CORTEX-M0PLUS-NOT: .eabi_attribute 42 +; CORTEX-M0PLUS-NOT: .eabi_attribute 44 +; CORTEX-M0PLUS-NOT: .eabi_attribute 68 ; CORTEX-M0PLUS-NOT: .eabi_attribute 
19 ;; We default to IEEE 754 compliance ; CORTEX-M0PLUS: .eabi_attribute 20, 1 @@ -941,13 +947,8 @@ ; CORTEX-M0PLUS: .eabi_attribute 23, 3 ; CORTEX-M0PLUS: .eabi_attribute 24, 1 ; CORTEX-M0PLUS: .eabi_attribute 25, 1 -; CORTEX-M0PLUS-NOT: .eabi_attribute 27 ; CORTEX-M0PLUS-NOT: .eabi_attribute 28 -; CORTEX-M0PLUS-NOT: .eabi_attribute 36 ; CORTEX-M0PLUS: .eabi_attribute 38, 1 -; CORTEX-M0PLUS-NOT: .eabi_attribute 42 -; CORTEX-M0PLUS-NOT: .eabi_attribute 44 -; CORTEX-M0PLUS-NOT: .eabi_attribute 68 ; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 19 ;; Despite the M0+ CPU having no FPU in this scenario, we chose to @@ -962,9 +963,14 @@ ; CORTEX-M1: .cpu cortex-m1 ; CORTEX-M1: .eabi_attribute 6, 12 -; CORTEX-M1-NOT: .eabi_attribute 7 +; CORTEX-M1: .eabi_attribute 7, 77 ; CORTEX-M1: .eabi_attribute 8, 0 ; CORTEX-M1: .eabi_attribute 9, 1 +; CORTEX-M1-NOT: .eabi_attribute 27 +; CORTEX-M1-NOT: .eabi_attribute 36 +; CORTEX-M1-NOT: .eabi_attribute 42 +; CORTEX-M1-NOT: .eabi_attribute 44 +; CORTEX-M1-NOT: .eabi_attribute 68 ; CORTEX-M1-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-M1: .eabi_attribute 20, 1 @@ -973,13 +979,8 @@ ; CORTEX-M1: .eabi_attribute 23, 3 ; CORTEX-M1: .eabi_attribute 24, 1 ; CORTEX-M1: .eabi_attribute 25, 1 -; CORTEX-M1-NOT: .eabi_attribute 27 ; CORTEX-M1-NOT: .eabi_attribute 28 -; CORTEX-M1-NOT: .eabi_attribute 36 ; CORTEX-M1: .eabi_attribute 38, 1 -; CORTEX-M1-NOT: .eabi_attribute 42 -; CORTEX-M1-NOT: .eabi_attribute 44 -; CORTEX-M1-NOT: .eabi_attribute 68 ; CORTEX-M1-FAST-NOT: .eabi_attribute 19 ;; Despite the M1 CPU having no FPU in this scenario, we chose to @@ -994,9 +995,13 @@ ; SC000: .cpu sc000 ; SC000: .eabi_attribute 6, 12 -; SC000-NOT: .eabi_attribute 7 +; SC000: .eabi_attribute 7, 77 ; SC000: .eabi_attribute 8, 0 ; SC000: .eabi_attribute 9, 1 +; SC000-NOT: .eabi_attribute 27 +; SC000-NOT: .eabi_attribute 42 +; SC000-NOT: .eabi_attribute 44 +; SC000-NOT: .eabi_attribute 68 ; SC000-NOT: .eabi_attribute 19 ;; We default to 
IEEE 754 compliance ; SC000: .eabi_attribute 20, 1 @@ -1005,13 +1010,8 @@ ; SC000: .eabi_attribute 23, 3 ; SC000: .eabi_attribute 24, 1 ; SC000: .eabi_attribute 25, 1 -; SC000-NOT: .eabi_attribute 27 ; SC000-NOT: .eabi_attribute 28 -; SC000-NOT: .eabi_attribute 36 ; SC000: .eabi_attribute 38, 1 -; SC000-NOT: .eabi_attribute 42 -; SC000-NOT: .eabi_attribute 44 -; SC000-NOT: .eabi_attribute 68 ; SC000-FAST-NOT: .eabi_attribute 19 ;; Despite the SC000 CPU having no FPU in this scenario, we chose to @@ -1029,6 +1029,11 @@ ; CORTEX-M3: .eabi_attribute 7, 77 ; CORTEX-M3: .eabi_attribute 8, 0 ; CORTEX-M3: .eabi_attribute 9, 2 +; CORTEX-M3-NOT: .eabi_attribute 27 +; CORTEX-M3-NOT: .eabi_attribute 36 +; CORTEX-M3-NOT: .eabi_attribute 42 +; CORTEX-M3-NOT: .eabi_attribute 44 +; CORTEX-M3-NOT: .eabi_attribute 68 ; CORTEX-M3-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-M3: .eabi_attribute 20, 1 @@ -1037,13 +1042,8 @@ ; CORTEX-M3: .eabi_attribute 23, 3 ; CORTEX-M3: .eabi_attribute 24, 1 ; CORTEX-M3: .eabi_attribute 25, 1 -; CORTEX-M3-NOT: .eabi_attribute 27 ; CORTEX-M3-NOT: .eabi_attribute 28 -; CORTEX-M3-NOT: .eabi_attribute 36 ; CORTEX-M3: .eabi_attribute 38, 1 -; CORTEX-M3-NOT: .eabi_attribute 42 -; CORTEX-M3-NOT: .eabi_attribute 44 -; CORTEX-M3-NOT: .eabi_attribute 68 ; CORTEX-M3-FAST-NOT: .eabi_attribute 19 ;; Despite there being no FPU, we chose to flush to zero preserving @@ -1059,6 +1059,11 @@ ; SC300: .eabi_attribute 7, 77 ; SC300: .eabi_attribute 8, 0 ; SC300: .eabi_attribute 9, 2 +; SC300-NOT: .eabi_attribute 27 +; SC300-NOT: .eabi_attribute 36 +; SC300-NOT: .eabi_attribute 42 +; SC300-NOT: .eabi_attribute 44 +; SC300-NOT: .eabi_attribute 68 ; SC300-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; SC300: .eabi_attribute 20, 1 @@ -1067,13 +1072,8 @@ ; SC300: .eabi_attribute 23, 3 ; SC300: .eabi_attribute 24, 1 ; SC300: .eabi_attribute 25, 1 -; SC300-NOT: .eabi_attribute 27 ; SC300-NOT: .eabi_attribute 28 -; SC300-NOT: 
.eabi_attribute 36 ; SC300: .eabi_attribute 38, 1 -; SC300-NOT: .eabi_attribute 42 -; SC300-NOT: .eabi_attribute 44 -; SC300-NOT: .eabi_attribute 68 ; SC300-FAST-NOT: .eabi_attribute 19 ;; Despite there being no FPU, we chose to flush to zero preserving @@ -1090,6 +1090,11 @@ ; CORTEX-M4-SOFT: .eabi_attribute 8, 0 ; CORTEX-M4-SOFT: .eabi_attribute 9, 2 ; CORTEX-M4-SOFT: .fpu fpv4-sp-d16 +; CORTEX-M4-SOFT: .eabi_attribute 27, 1 +; CORTEX-M4-SOFT: .eabi_attribute 36, 1 +; CORTEX-M4-SOFT-NOT: .eabi_attribute 42 +; CORTEX-M4-SOFT-NOT: .eabi_attribute 44 +; CORTEX-M4-SOFT-NOT: .eabi_attribute 68 ; CORTEX-M4-SOFT-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-M4-SOFT: .eabi_attribute 20, 1 @@ -1098,13 +1103,8 @@ ; CORTEX-M4-SOFT: .eabi_attribute 23, 3 ; CORTEX-M4-SOFT: .eabi_attribute 24, 1 ; CORTEX-M4-SOFT: .eabi_attribute 25, 1 -; CORTEX-M4-SOFT: .eabi_attribute 27, 1 ; CORTEX-M4-SOFT-NOT: .eabi_attribute 28 -; CORTEX-M4-SOFT: .eabi_attribute 36, 1 ; CORTEX-M4-SOFT: .eabi_attribute 38, 1 -; CORTEX-M4-SOFT-NOT: .eabi_attribute 42 -; CORTEX-M4-SOFT-NOT: .eabi_attribute 44 -; CORTEX-M4-SOFT-NOT: .eabi_attribute 68 ; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 19 ;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when @@ -1120,6 +1120,11 @@ ; CORTEX-M4-HARD: .eabi_attribute 8, 0 ; CORTEX-M4-HARD: .eabi_attribute 9, 2 ; CORTEX-M4-HARD: .fpu fpv4-sp-d16 +; CORTEX-M4-HARD: .eabi_attribute 27, 1 +; CORTEX-M4-HARD: .eabi_attribute 36, 1 +; CORTEX-M4-HARD-NOT: .eabi_attribute 42 +; CORTEX-M4-HARD-NOT: .eabi_attribute 44 +; CORTEX-M4-HARD-NOT: .eabi_attribute 68 ; CORTEX-M4-HARD-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-M4-HARD: .eabi_attribute 20, 1 @@ -1128,13 +1133,8 @@ ; CORTEX-M4-HARD: .eabi_attribute 23, 3 ; CORTEX-M4-HARD: .eabi_attribute 24, 1 ; CORTEX-M4-HARD: .eabi_attribute 25, 1 -; CORTEX-M4-HARD: .eabi_attribute 27, 1 ; CORTEX-M4-HARD: .eabi_attribute 28, 1 -; CORTEX-M4-HARD: .eabi_attribute 36, 1 ; 
CORTEX-M4-HARD: .eabi_attribute 38, 1 -; CORTEX-M4-HARD-NOT: .eabi_attribute 42 -; CORTEX-M4-HARD-NOT: .eabi_attribute 44 -; CORTEX-M4-HARD-NOT: .eabi_attribute 68 ; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 19 ;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when @@ -1152,6 +1152,11 @@ ; CORTEX-M7-SOFT-NOT: .fpu ; CORTEX-M7-SINGLE: .fpu fpv5-sp-d16 ; CORTEX-M7-DOUBLE: .fpu fpv5-d16 +; CORTEX-M7-SOFT-NOT: .eabi_attribute 27 +; CORTEX-M7-SINGLE: .eabi_attribute 27, 1 +; CORTEX-M7-DOUBLE-NOT: .eabi_attribute 27 +; CORTEX-M7: .eabi_attribute 36, 1 +; CORTEX-M7-NOT: .eabi_attribute 44 ; CORTEX-M7: .eabi_attribute 17, 1 ; CORTEX-M7-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance @@ -1161,12 +1166,7 @@ ; CORTEX-M7: .eabi_attribute 23, 3 ; CORTEX-M7: .eabi_attribute 24, 1 ; CORTEX-M7: .eabi_attribute 25, 1 -; CORTEX-M7-SOFT-NOT: .eabi_attribute 27 -; CORTEX-M7-SINGLE: .eabi_attribute 27, 1 -; CORTEX-M7-DOUBLE-NOT: .eabi_attribute 27 -; CORTEX-M7: .eabi_attribute 36, 1 ; CORTEX-M7: .eabi_attribute 38, 1 -; CORTEX-M7-NOT: .eabi_attribute 44 ; CORTEX-M7: .eabi_attribute 14, 0 ; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 19 @@ -1186,6 +1186,10 @@ ; CORTEX-R4: .eabi_attribute 8, 1 ; CORTEX-R4: .eabi_attribute 9, 2 ; CORTEX-R4-NOT: .fpu vfpv3-d16 +; CORTEX-R4-NOT: .eabi_attribute 36 +; CORTEX-R4-NOT: .eabi_attribute 42 +; CORTEX-R4-NOT: .eabi_attribute 44 +; CORTEX-R4-NOT: .eabi_attribute 68 ; CORTEX-R4-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-R4: .eabi_attribute 20, 1 @@ -1195,11 +1199,7 @@ ; CORTEX-R4: .eabi_attribute 24, 1 ; CORTEX-R4: .eabi_attribute 25, 1 ; CORTEX-R4-NOT: .eabi_attribute 28 -; CORTEX-R4-NOT: .eabi_attribute 36 ; CORTEX-R4: .eabi_attribute 38, 1 -; CORTEX-R4-NOT: .eabi_attribute 42 -; CORTEX-R4-NOT: .eabi_attribute 44 -; CORTEX-R4-NOT: .eabi_attribute 68 ; CORTEX-R4F: .cpu cortex-r4f ; CORTEX-R4F: .eabi_attribute 6, 10 @@ -1207,6 +1207,11 @@ ; CORTEX-R4F: .eabi_attribute 8, 1 ; CORTEX-R4F: 
.eabi_attribute 9, 2 ; CORTEX-R4F: .fpu vfpv3-d16 +; CORTEX-R4F-NOT: .eabi_attribute 27, 1 +; CORTEX-R4F-NOT: .eabi_attribute 36 +; CORTEX-R4F-NOT: .eabi_attribute 42 +; CORTEX-R4F-NOT: .eabi_attribute 44 +; CORTEX-R4F-NOT: .eabi_attribute 68 ; CORTEX-R4F-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-R4F: .eabi_attribute 20, 1 @@ -1215,13 +1220,8 @@ ; CORTEX-R4F: .eabi_attribute 23, 3 ; CORTEX-R4F: .eabi_attribute 24, 1 ; CORTEX-R4F: .eabi_attribute 25, 1 -; CORTEX-R4F-NOT: .eabi_attribute 27, 1 ; CORTEX-R4F-NOT: .eabi_attribute 28 -; CORTEX-R4F-NOT: .eabi_attribute 36 ; CORTEX-R4F: .eabi_attribute 38, 1 -; CORTEX-R4F-NOT: .eabi_attribute 42 -; CORTEX-R4F-NOT: .eabi_attribute 44 -; CORTEX-R4F-NOT: .eabi_attribute 68 ; CORTEX-R5: .cpu cortex-r5 ; CORTEX-R5: .eabi_attribute 6, 10 @@ -1229,6 +1229,11 @@ ; CORTEX-R5: .eabi_attribute 8, 1 ; CORTEX-R5: .eabi_attribute 9, 2 ; CORTEX-R5: .fpu vfpv3-d16 +; CORTEX-R5-NOT: .eabi_attribute 27, 1 +; CORTEX-R5-NOT: .eabi_attribute 36 +; CORTEX-R5: .eabi_attribute 44, 2 +; CORTEX-R5-NOT: .eabi_attribute 42 +; CORTEX-R5-NOT: .eabi_attribute 68 ; CORTEX-R5-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-R5: .eabi_attribute 20, 1 @@ -1237,13 +1242,8 @@ ; CORTEX-R5: .eabi_attribute 23, 3 ; CORTEX-R5: .eabi_attribute 24, 1 ; CORTEX-R5: .eabi_attribute 25, 1 -; CORTEX-R5-NOT: .eabi_attribute 27, 1 ; CORTEX-R5-NOT: .eabi_attribute 28 -; CORTEX-R5-NOT: .eabi_attribute 36 ; CORTEX-R5: .eabi_attribute 38, 1 -; CORTEX-R5-NOT: .eabi_attribute 42 -; CORTEX-R5: .eabi_attribute 44, 2 -; CORTEX-R5-NOT: .eabi_attribute 68 ; CORTEX-R5-FAST-NOT: .eabi_attribute 19 ;; The R5 has the VFPv3 FP unit, which always flushes preserving sign. 
@@ -1258,6 +1258,10 @@ ; CORTEX-R7: .eabi_attribute 8, 1 ; CORTEX-R7: .eabi_attribute 9, 2 ; CORTEX-R7: .fpu vfpv3-d16-fp16 +; CORTEX-R7: .eabi_attribute 36, 1 +; CORTEX-R7: .eabi_attribute 42, 1 +; CORTEX-R7: .eabi_attribute 44, 2 +; CORTEX-R7-NOT: .eabi_attribute 68 ; CORTEX-R7-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-R7: .eabi_attribute 20, 1 @@ -1267,11 +1271,7 @@ ; CORTEX-R7: .eabi_attribute 24, 1 ; CORTEX-R7: .eabi_attribute 25, 1 ; CORTEX-R7-NOT: .eabi_attribute 28 -; CORTEX-R7: .eabi_attribute 36, 1 ; CORTEX-R7: .eabi_attribute 38, 1 -; CORTEX-R7: .eabi_attribute 42, 1 -; CORTEX-R7: .eabi_attribute 44, 2 -; CORTEX-R7-NOT: .eabi_attribute 68 ; CORTEX-R7-FAST-NOT: .eabi_attribute 19 ;; The R7 has the VFPv3 FP unit, which always flushes preserving sign. @@ -1286,6 +1286,10 @@ ; CORTEX-R8: .eabi_attribute 8, 1 ; CORTEX-R8: .eabi_attribute 9, 2 ; CORTEX-R8: .fpu vfpv3-d16-fp16 +; CORTEX-R8: .eabi_attribute 36, 1 +; CORTEX-R8: .eabi_attribute 42, 1 +; CORTEX-R8: .eabi_attribute 44, 2 +; CORTEX-R8-NOT: .eabi_attribute 68 ; CORTEX-R8-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-R8: .eabi_attribute 20, 1 @@ -1295,11 +1299,7 @@ ; CORTEX-R8: .eabi_attribute 24, 1 ; CORTEX-R8: .eabi_attribute 25, 1 ; CORTEX-R8-NOT: .eabi_attribute 28 -; CORTEX-R8: .eabi_attribute 36, 1 ; CORTEX-R8: .eabi_attribute 38, 1 -; CORTEX-R8: .eabi_attribute 42, 1 -; CORTEX-R8: .eabi_attribute 44, 2 -; CORTEX-R8-NOT: .eabi_attribute 68 ; CORTEX-R8-FAST-NOT: .eabi_attribute 19 ;; The R8 has the VFPv3 FP unit, which always flushes preserving sign. 
@@ -1315,6 +1315,11 @@ ; CORTEX-A32: .eabi_attribute 9, 2 ; CORTEX-A32: .fpu crypto-neon-fp-armv8 ; CORTEX-A32: .eabi_attribute 12, 3 +; CORTEX-A32-NOT: .eabi_attribute 27 +; CORTEX-A32: .eabi_attribute 36, 1 +; CORTEX-A32: .eabi_attribute 42, 1 +; CORTEX-A32-NOT: .eabi_attribute 44 +; CORTEX-A32: .eabi_attribute 68, 3 ; CORTEX-A32-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A32: .eabi_attribute 20, 1 @@ -1323,13 +1328,8 @@ ; CORTEX-A32: .eabi_attribute 23, 3 ; CORTEX-A32: .eabi_attribute 24, 1 ; CORTEX-A32: .eabi_attribute 25, 1 -; CORTEX-A32-NOT: .eabi_attribute 27 ; CORTEX-A32-NOT: .eabi_attribute 28 -; CORTEX-A32: .eabi_attribute 36, 1 ; CORTEX-A32: .eabi_attribute 38, 1 -; CORTEX-A32: .eabi_attribute 42, 1 -; CORTEX-A32-NOT: .eabi_attribute 44 -; CORTEX-A32: .eabi_attribute 68, 3 ; CORTEX-A32-FAST-NOT: .eabi_attribute 19 ;; The A32 has the ARMv8 FP unit, which always flushes preserving sign. @@ -1343,20 +1343,20 @@ ; CORTEX-M23: .eabi_attribute 7, 77 ; CORTEX-M23: .eabi_attribute 8, 0 ; CORTEX-M23: .eabi_attribute 9, 3 +; CORTEX-M23-NOT: .eabi_attribute 27 +; CORTEX-M23: .eabi_attribute 34, 1 +; CORTEX-M23-NOT: .eabi_attribute 44 ; CORTEX-M23: .eabi_attribute 17, 1 ;; We default to IEEE 754 compliance ; CORTEX-M23-NOT: .eabi_attribute 19 ; CORTEX-M23: .eabi_attribute 20, 1 ; CORTEX-M23: .eabi_attribute 21, 1 ; CORTEX-M23: .eabi_attribute 23, 3 -; CORTEX-M23: .eabi_attribute 34, 1 ; CORTEX-M23: .eabi_attribute 24, 1 -; CORTEX-M23-NOT: .eabi_attribute 27 ; CORTEX-M23-NOT: .eabi_attribute 28 ; CORTEX-M23: .eabi_attribute 25, 1 ; CORTEX-M23: .eabi_attribute 38, 1 ; CORTEX-M23: .eabi_attribute 14, 0 -; CORTEX-M23-NOT: .eabi_attribute 44 ; CORTEX-M33: .cpu cortex-m33 ; CORTEX-M33: .eabi_attribute 6, 17 @@ -1364,21 +1364,21 @@ ; CORTEX-M33: .eabi_attribute 8, 0 ; CORTEX-M33: .eabi_attribute 9, 3 ; CORTEX-M33: .fpu fpv5-sp-d16 +; CORTEX-M33: .eabi_attribute 27, 1 +; CORTEX-M33: .eabi_attribute 36, 1 +; CORTEX-M33-NOT: .eabi_attribute 44 +; 
CORTEX-M33: .eabi_attribute 46, 1 +; CORTEX-M33: .eabi_attribute 34, 1 ; CORTEX-M33: .eabi_attribute 17, 1 ;; We default to IEEE 754 compliance ; CORTEX-M23-NOT: .eabi_attribute 19 ; CORTEX-M33: .eabi_attribute 20, 1 ; CORTEX-M33: .eabi_attribute 21, 1 ; CORTEX-M33: .eabi_attribute 23, 3 -; CORTEX-M33: .eabi_attribute 34, 1 ; CORTEX-M33: .eabi_attribute 24, 1 ; CORTEX-M33: .eabi_attribute 25, 1 -; CORTEX-M33: .eabi_attribute 27, 1 ; CORTEX-M33-NOT: .eabi_attribute 28 -; CORTEX-M33: .eabi_attribute 36, 1 ; CORTEX-M33: .eabi_attribute 38, 1 -; CORTEX-M33: .eabi_attribute 46, 1 -; CORTEX-M33-NOT: .eabi_attribute 44 ; CORTEX-M33: .eabi_attribute 14, 0 ; CORTEX-M33-FAST-NOT: .eabi_attribute 19 @@ -1394,6 +1394,11 @@ ; CORTEX-A35: .eabi_attribute 9, 2 ; CORTEX-A35: .fpu crypto-neon-fp-armv8 ; CORTEX-A35: .eabi_attribute 12, 3 +; CORTEX-A35-NOT: .eabi_attribute 27 +; CORTEX-A35: .eabi_attribute 36, 1 +; CORTEX-A35: .eabi_attribute 42, 1 +; CORTEX-A35-NOT: .eabi_attribute 44 +; CORTEX-A35: .eabi_attribute 68, 3 ; CORTEX-A35-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A35: .eabi_attribute 20, 1 @@ -1402,13 +1407,8 @@ ; CORTEX-A35: .eabi_attribute 23, 3 ; CORTEX-A35: .eabi_attribute 24, 1 ; CORTEX-A35: .eabi_attribute 25, 1 -; CORTEX-A35-NOT: .eabi_attribute 27 ; CORTEX-A35-NOT: .eabi_attribute 28 -; CORTEX-A35: .eabi_attribute 36, 1 ; CORTEX-A35: .eabi_attribute 38, 1 -; CORTEX-A35: .eabi_attribute 42, 1 -; CORTEX-A35-NOT: .eabi_attribute 44 -; CORTEX-A35: .eabi_attribute 68, 3 ; CORTEX-A35-FAST-NOT: .eabi_attribute 19 ;; The A35 has the ARMv8 FP unit, which always flushes preserving sign. 
@@ -1424,6 +1424,11 @@ ; CORTEX-A53: .eabi_attribute 9, 2 ; CORTEX-A53: .fpu crypto-neon-fp-armv8 ; CORTEX-A53: .eabi_attribute 12, 3 +; CORTEX-A53-NOT: .eabi_attribute 27 +; CORTEX-A53: .eabi_attribute 36, 1 +; CORTEX-A53: .eabi_attribute 42, 1 +; CORTEX-A53-NOT: .eabi_attribute 44 +; CORTEX-A53: .eabi_attribute 68, 3 ; CORTEX-A53-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A53: .eabi_attribute 20, 1 @@ -1432,13 +1437,8 @@ ; CORTEX-A53: .eabi_attribute 23, 3 ; CORTEX-A53: .eabi_attribute 24, 1 ; CORTEX-A53: .eabi_attribute 25, 1 -; CORTEX-A53-NOT: .eabi_attribute 27 ; CORTEX-A53-NOT: .eabi_attribute 28 -; CORTEX-A53: .eabi_attribute 36, 1 ; CORTEX-A53: .eabi_attribute 38, 1 -; CORTEX-A53: .eabi_attribute 42, 1 -; CORTEX-A53-NOT: .eabi_attribute 44 -; CORTEX-A53: .eabi_attribute 68, 3 ; CORTEX-A53-FAST-NOT: .eabi_attribute 19 ;; The A53 has the ARMv8 FP unit, which always flushes preserving sign. @@ -1454,6 +1454,11 @@ ; CORTEX-A57: .eabi_attribute 9, 2 ; CORTEX-A57: .fpu crypto-neon-fp-armv8 ; CORTEX-A57: .eabi_attribute 12, 3 +; CORTEX-A57-NOT: .eabi_attribute 27 +; CORTEX-A57: .eabi_attribute 36, 1 +; CORTEX-A57: .eabi_attribute 42, 1 +; CORTEX-A57-NOT: .eabi_attribute 44 +; CORTEX-A57: .eabi_attribute 68, 3 ; CORTEX-A57-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A57: .eabi_attribute 20, 1 @@ -1462,13 +1467,8 @@ ; CORTEX-A57: .eabi_attribute 23, 3 ; CORTEX-A57: .eabi_attribute 24, 1 ; CORTEX-A57: .eabi_attribute 25, 1 -; CORTEX-A57-NOT: .eabi_attribute 27 ; CORTEX-A57-NOT: .eabi_attribute 28 -; CORTEX-A57: .eabi_attribute 36, 1 ; CORTEX-A57: .eabi_attribute 38, 1 -; CORTEX-A57: .eabi_attribute 42, 1 -; CORTEX-A57-NOT: .eabi_attribute 44 -; CORTEX-A57: .eabi_attribute 68, 3 ; CORTEX-A57-FAST-NOT: .eabi_attribute 19 ;; The A57 has the ARMv8 FP unit, which always flushes preserving sign. 
@@ -1484,6 +1484,11 @@ ; CORTEX-A72: .eabi_attribute 9, 2 ; CORTEX-A72: .fpu crypto-neon-fp-armv8 ; CORTEX-A72: .eabi_attribute 12, 3 +; CORTEX-A72-NOT: .eabi_attribute 27 +; CORTEX-A72: .eabi_attribute 36, 1 +; CORTEX-A72: .eabi_attribute 42, 1 +; CORTEX-A72-NOT: .eabi_attribute 44 +; CORTEX-A72: .eabi_attribute 68, 3 ; CORTEX-A72-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A72: .eabi_attribute 20, 1 @@ -1492,13 +1497,8 @@ ; CORTEX-A72: .eabi_attribute 23, 3 ; CORTEX-A72: .eabi_attribute 24, 1 ; CORTEX-A72: .eabi_attribute 25, 1 -; CORTEX-A72-NOT: .eabi_attribute 27 ; CORTEX-A72-NOT: .eabi_attribute 28 -; CORTEX-A72: .eabi_attribute 36, 1 ; CORTEX-A72: .eabi_attribute 38, 1 -; CORTEX-A72: .eabi_attribute 42, 1 -; CORTEX-A72-NOT: .eabi_attribute 44 -; CORTEX-A72: .eabi_attribute 68, 3 ; CORTEX-A72-FAST-NOT: .eabi_attribute 19 ;; The A72 has the ARMv8 FP unit, which always flushes preserving sign. @@ -1514,6 +1514,11 @@ ; CORTEX-A73: .eabi_attribute 9, 2 ; CORTEX-A73: .fpu crypto-neon-fp-armv8 ; CORTEX-A73: .eabi_attribute 12, 3 +; CORTEX-A73-NOT: .eabi_attribute 27 +; CORTEX-A73: .eabi_attribute 36, 1 +; CORTEX-A73: .eabi_attribute 42, 1 +; CORTEX-A73-NOT: .eabi_attribute 44 +; CORTEX-A73: .eabi_attribute 68, 3 ; CORTEX-A73-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-A73: .eabi_attribute 20, 1 @@ -1522,14 +1527,9 @@ ; CORTEX-A73: .eabi_attribute 23, 3 ; CORTEX-A73: .eabi_attribute 24, 1 ; CORTEX-A73: .eabi_attribute 25, 1 -; CORTEX-A73-NOT: .eabi_attribute 27 ; CORTEX-A73-NOT: .eabi_attribute 28 -; CORTEX-A73: .eabi_attribute 36, 1 ; CORTEX-A73: .eabi_attribute 38, 1 -; CORTEX-A73: .eabi_attribute 42, 1 -; CORTEX-A73-NOT: .eabi_attribute 44 ; CORTEX-A73: .eabi_attribute 14, 0 -; CORTEX-A73: .eabi_attribute 68, 3 ; EXYNOS-M1: .cpu exynos-m1 ; EXYNOS-M1: .eabi_attribute 6, 14 @@ -1538,6 +1538,11 @@ ; EXYNOS-M1: .eabi_attribute 9, 2 ; EXYNOS-M1: .fpu crypto-neon-fp-armv8 ; EXYNOS-M1: .eabi_attribute 12, 3 +; 
EXYNOS-M1-NOT: .eabi_attribute 27 +; EXYNOS-M1: .eabi_attribute 36, 1 +; EXYNOS-M1: .eabi_attribute 42, 1 +; EXYNOS-M1-NOT: .eabi_attribute 44 +; EXYNOS-M1: .eabi_attribute 68, 3 ; EXYNOS-M1-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; EXYNOS-M1: .eabi_attribute 20, 1 @@ -1546,13 +1551,8 @@ ; EXYNOS-M1: .eabi_attribute 23, 3 ; EXYNOS-M1: .eabi_attribute 24, 1 ; EXYNOS-M1: .eabi_attribute 25, 1 -; EXYNOS-M1-NOT: .eabi_attribute 27 ; EXYNOS-M1-NOT: .eabi_attribute 28 -; EXYNOS-M1: .eabi_attribute 36, 1 ; EXYNOS-M1: .eabi_attribute 38, 1 -; EXYNOS-M1: .eabi_attribute 42, 1 -; EXYNOS-M1-NOT: .eabi_attribute 44 -; EXYNOS-M1: .eabi_attribute 68, 3 ; EXYNOS-M1-FAST-NOT: .eabi_attribute 19 ;; The exynos-m1 has the ARMv8 FP unit, which always flushes preserving sign. @@ -1568,6 +1568,11 @@ ; EXYNOS-M2: .eabi_attribute 9, 2 ; EXYNOS-M2: .fpu crypto-neon-fp-armv8 ; EXYNOS-M2: .eabi_attribute 12, 3 +; EXYNOS-M2-NOT: .eabi_attribute 27 +; EXYNOS-M2: .eabi_attribute 36, 1 +; EXYNOS-M2: .eabi_attribute 42, 1 +; EXYNOS-M2-NOT: .eabi_attribute 44 +; EXYNOS-M2: .eabi_attribute 68, 3 ; EXYNOS-M2-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; EXYNOS-M2: .eabi_attribute 20, 1 @@ -1576,13 +1581,8 @@ ; EXYNOS-M2: .eabi_attribute 23, 3 ; EXYNOS-M2: .eabi_attribute 24, 1 ; EXYNOS-M2: .eabi_attribute 25, 1 -; EXYNOS-M2-NOT: .eabi_attribute 27 ; EXYNOS-M2-NOT: .eabi_attribute 28 -; EXYNOS-M2: .eabi_attribute 36, 1 ; EXYNOS-M2: .eabi_attribute 38, 1 -; EXYNOS-M2: .eabi_attribute 42, 1 -; EXYNOS-M2-NOT: .eabi_attribute 44 -; EXYNOS-M2: .eabi_attribute 68, 3 ; EXYNOS-M3: .cpu exynos-m3 ; EXYNOS-M3: .eabi_attribute 6, 14 @@ -1591,6 +1591,11 @@ ; EXYNOS-M3: .eabi_attribute 9, 2 ; EXYNOS-M3: .fpu crypto-neon-fp-armv8 ; EXYNOS-M3: .eabi_attribute 12, 3 +; EXYNOS-M3-NOT: .eabi_attribute 27 +; EXYNOS-M3: .eabi_attribute 36, 1 +; EXYNOS-M3: .eabi_attribute 42, 1 +; EXYNOS-M3-NOT: .eabi_attribute 44 +; EXYNOS-M3: .eabi_attribute 68, 3 ; EXYNOS-M3-NOT: .eabi_attribute 
19 ;; We default to IEEE 754 compliance ; EXYNOS-M3: .eabi_attribute 20, 1 @@ -1599,13 +1604,8 @@ ; EXYNOS-M3: .eabi_attribute 23, 3 ; EXYNOS-M3: .eabi_attribute 24, 1 ; EXYNOS-M3: .eabi_attribute 25, 1 -; EXYNOS-M3-NOT: .eabi_attribute 27 ; EXYNOS-M3-NOT: .eabi_attribute 28 -; EXYNOS-M3: .eabi_attribute 36, 1 ; EXYNOS-M3: .eabi_attribute 38, 1 -; EXYNOS-M3: .eabi_attribute 42, 1 -; EXYNOS-M3-NOT: .eabi_attribute 44 -; EXYNOS-M3: .eabi_attribute 68, 3 ; GENERIC-FPU-VFPV3-FP16: .fpu vfpv3-fp16 ; GENERIC-FPU-VFPV3-D16-FP16: .fpu vfpv3-d16-fp16 @@ -1619,6 +1619,11 @@ ; GENERIC-ARMV8_1-A: .eabi_attribute 9, 2 ; GENERIC-ARMV8_1-A: .fpu crypto-neon-fp-armv8 ; GENERIC-ARMV8_1-A: .eabi_attribute 12, 4 +; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 27 +; GENERIC-ARMV8_1-A: .eabi_attribute 36, 1 +; GENERIC-ARMV8_1-A: .eabi_attribute 42, 1 +; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 44 +; GENERIC-ARMV8_1-A: .eabi_attribute 68, 3 ; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; GENERIC-ARMV8_1-A: .eabi_attribute 20, 1 @@ -1627,13 +1632,8 @@ ; GENERIC-ARMV8_1-A: .eabi_attribute 23, 3 ; GENERIC-ARMV8_1-A: .eabi_attribute 24, 1 ; GENERIC-ARMV8_1-A: .eabi_attribute 25, 1 -; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 27 ; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 28 -; GENERIC-ARMV8_1-A: .eabi_attribute 36, 1 ; GENERIC-ARMV8_1-A: .eabi_attribute 38, 1 -; GENERIC-ARMV8_1-A: .eabi_attribute 42, 1 -; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 44 -; GENERIC-ARMV8_1-A: .eabi_attribute 68, 3 ; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 19 ;; GENERIC-ARMV8_1-A has the ARMv8 FP unit, which always flushes preserving sign. 
@@ -1670,23 +1670,16 @@ ; ARMv8R-SP-NOT: .eabi_attribute 12 ; ARMv8R-NEON: .fpu neon-fp-armv8 ; ARMv8R-NEON: .eabi_attribute 12, 3 @ Tag_Advanced_SIMD_arch -; ARMv8R: .eabi_attribute 17, 1 @ Tag_ABI_PCS_GOT_use -; ARMv8R: .eabi_attribute 20, 1 @ Tag_ABI_FP_denormal -; ARMv8R: .eabi_attribute 21, 1 @ Tag_ABI_FP_exceptions -; ARMv8R: .eabi_attribute 23, 3 @ Tag_ABI_FP_number_model -; ARMv8R: .eabi_attribute 34, 1 @ Tag_CPU_unaligned_access -; ARMv8R: .eabi_attribute 24, 1 @ Tag_ABI_align_needed -; ARMv8R: .eabi_attribute 25, 1 @ Tag_ABI_align_preserved ; ARMv8R-NOFPU-NOT: .eabi_attribute 27 ; ARMv8R-SP: .eabi_attribute 27, 1 @ Tag_ABI_HardFP_use ; ARMv8R-NEON-NOT: .eabi_attribute 27 ; ARMv8R-NOFPU-NOT: .eabi_attribute 36 ; ARMv8R-SP: .eabi_attribute 36, 1 @ Tag_FP_HP_extension ; ARMv8R-NEON: .eabi_attribute 36, 1 @ Tag_FP_HP_extension -; ARMv8R: .eabi_attribute 38, 1 @ Tag_ABI_FP_16bit_format ; ARMv8R: .eabi_attribute 42, 1 @ Tag_MPextension_use -; ARMv8R: .eabi_attribute 14, 0 @ Tag_ABI_PCS_R9_use ; ARMv8R: .eabi_attribute 68, 2 @ Tag_Virtualization_use +; ARMv8R: .eabi_attribute 38, 1 @ Tag_ABI_FP_16bit_format +; ARMv8R: .eabi_attribute 14, 0 @ Tag_ABI_PCS_R9_use define i32 @f(i64 %z) { ret i32 0 diff --git a/test/CodeGen/ARM/darwin-tls-preserved.ll b/test/CodeGen/ARM/darwin-tls-preserved.ll new file mode 100644 index 000000000000..4969fabfd9b3 --- /dev/null +++ b/test/CodeGen/ARM/darwin-tls-preserved.ll @@ -0,0 +1,24 @@ +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 -arm-atomic-cfg-tidy=0 -o - %s | FileCheck %s + +@tls_var = thread_local global i32 0 + +; r9 and r12 can be live across the asm, but those get clobbered by the TLS +; access (in a different BB to order it). 
+define i32 @test_regs_preserved(i32* %ptr1, i32* %ptr2, i1 %tst1) { +; CHECK-LABEL: test_regs_preserved: +; CHECK: str {{.*}}, [sp +; CHECK: mov {{.*}}, r12 +entry: + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r10},~{r11},~{r13},~{lr}"() + br i1 %tst1, label %get_tls, label %done + +get_tls: + %val = load i32, i32* @tls_var + br label %done + +done: + %res = phi i32 [%val, %get_tls], [0, %entry] + store i32 42, i32* %ptr1 + store i32 42, i32* %ptr2 + ret i32 %res +} diff --git a/test/CodeGen/ARM/divmod-hwdiv.ll b/test/CodeGen/ARM/divmod-hwdiv.ll new file mode 100644 index 000000000000..4cc316ffa3ea --- /dev/null +++ b/test/CodeGen/ARM/divmod-hwdiv.ll @@ -0,0 +1,37 @@ +; The hwdiv subtarget feature should only influence thumb, not arm. +; RUN: llc < %s -mtriple=arm-gnueabi -mattr=+hwdiv | FileCheck %s -check-prefixes=ALL,AEABI-NOHWDIV +; RUN: llc < %s -mtriple=arm-gnueabi -mattr=-hwdiv | FileCheck %s -check-prefixes=ALL,AEABI-NOHWDIV +; RUN: llc < %s -mtriple=thumbv7-gnueabi -mattr=+hwdiv | FileCheck %s -check-prefixes=ALL,THUMB-HWDIV +; RUN: llc < %s -mtriple=thumbv7-gnueabi -mattr=-hwdiv | FileCheck %s -check-prefixes=ALL,AEABI-NOHWDIV + +; The hwdiv-arm subtarget feature should only influence arm, not thumb. 
+; RUN: llc < %s -mtriple=arm-gnueabi -mattr=+hwdiv-arm | FileCheck %s -check-prefixes=ALL,ARM-HWDIV +; RUN: llc < %s -mtriple=arm-gnueabi -mattr=-hwdiv-arm | FileCheck %s -check-prefixes=ALL,AEABI-NOHWDIV +; RUN: llc < %s -mtriple=thumbv7-gnueabi -mattr=+hwdiv-arm | FileCheck %s -check-prefixes=ALL,AEABI-NOHWDIV +; RUN: llc < %s -mtriple=thumbv7-gnueabi -mattr=-hwdiv-arm | FileCheck %s -check-prefixes=ALL,AEABI-NOHWDIV + +define arm_aapcscc i32 @test_i32_srem(i32 %x, i32 %y) { +; ALL-LABEL: test_i32_srem: +; ARM-HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; ARM-HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; ARM-HWDIV: sub r0, r0, [[P]] +; THUMB-HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; THUMB-HWDIV: mls r0, [[Q]], r1, r0 +; AEABI-NOHWDIV: bl __aeabi_idivmod +; AEABI-NOHWDIV: mov r0, r1 + %r = srem i32 %x, %y + ret i32 %r +} + +define arm_aapcscc i32 @test_i32_urem(i32 %x, i32 %y) { +; ALL-LABEL: test_i32_urem: +; ARM-HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; ARM-HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; ARM-HWDIV: sub r0, r0, [[P]] +; THUMB-HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; THUMB-HWDIV: mls r0, [[Q]], r1, r0 +; AEABI-NOHWDIV: bl __aeabi_uidivmod +; AEABI-NOHWDIV: mov r0, r1 + %r = urem i32 %x, %y + ret i32 %r +} diff --git a/test/CodeGen/ARM/fpoffset_overflow.mir b/test/CodeGen/ARM/fpoffset_overflow.mir new file mode 100644 index 000000000000..9c6cd931b153 --- /dev/null +++ b/test/CodeGen/ARM/fpoffset_overflow.mir @@ -0,0 +1,94 @@ +# RUN: llc -o - %s -mtriple=thumbv7-- -run-pass=stack-protector -run-pass=prologepilog | FileCheck %s +--- +# This should trigger an emergency spill in the register scavenger because the +# frame offset into the large argument is too large. 
+# CHECK-LABEL: name: func0 +# CHECK: t2STRi12 killed %r7, %sp, 0, 14, _ :: (store 4 into %stack.0) +# CHECK: %r7 = t2ADDri killed %sp, 4096, 14, _, _ +# CHECK: %r11 = t2LDRi12 killed %r7, 36, 14, _ :: (load 4) +# CHECK: %r7 = t2LDRi12 %sp, 0, 14, _ :: (load 4 from %stack.0) +name: func0 +tracksRegLiveness: true +fixedStack: + - { id: 0, offset: 4084, size: 4, alignment: 4, isImmutable: true, + isAliased: false } + - { id: 1, offset: -12, size: 4096, alignment: 4, isImmutable: false, + isAliased: false } +body: | + bb.0: + %r0 = IMPLICIT_DEF + %r1 = IMPLICIT_DEF + %r2 = IMPLICIT_DEF + %r3 = IMPLICIT_DEF + %r4 = IMPLICIT_DEF + %r5 = IMPLICIT_DEF + %r6 = IMPLICIT_DEF + %r8 = IMPLICIT_DEF + %r9 = IMPLICIT_DEF + %r10 = IMPLICIT_DEF + %r11 = IMPLICIT_DEF + %r12 = IMPLICIT_DEF + %lr = IMPLICIT_DEF + + %r11 = t2LDRi12 %fixed-stack.0, 0, 14, _ :: (load 4) + + KILL %r0 + KILL %r1 + KILL %r2 + KILL %r3 + KILL %r4 + KILL %r5 + KILL %r6 + KILL %r8 + KILL %r9 + KILL %r10 + KILL %r11 + KILL %r12 + KILL %lr +... +--- +# This should not trigger an emergency spill yet. +# CHECK-LABEL: name: func1 +# CHECK-NOT: t2STRi12 +# CHECK-NOT: t2ADDri +# CHECK: %r11 = t2LDRi12 %sp, 4092, 14, _ :: (load 4) +# CHECK-NOT: t2LDRi12 +name: func1 +tracksRegLiveness: true +fixedStack: + - { id: 0, offset: 4044, size: 4, alignment: 4, isImmutable: true, + isAliased: false } + - { id: 1, offset: -12, size: 4056, alignment: 4, isImmutable: false, + isAliased: false } +body: | + bb.0: + %r0 = IMPLICIT_DEF + %r1 = IMPLICIT_DEF + %r2 = IMPLICIT_DEF + %r3 = IMPLICIT_DEF + %r4 = IMPLICIT_DEF + %r5 = IMPLICIT_DEF + %r6 = IMPLICIT_DEF + %r8 = IMPLICIT_DEF + %r9 = IMPLICIT_DEF + %r10 = IMPLICIT_DEF + %r11 = IMPLICIT_DEF + %r12 = IMPLICIT_DEF + %lr = IMPLICIT_DEF + + %r11 = t2LDRi12 %fixed-stack.0, 0, 14, _ :: (load 4) + + KILL %r0 + KILL %r1 + KILL %r2 + KILL %r3 + KILL %r4 + KILL %r5 + KILL %r6 + KILL %r8 + KILL %r9 + KILL %r10 + KILL %r11 + KILL %r12 + KILL %lr +... 
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index d874884dcb39..fb204debf612 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -30,10 +30,9 @@ entry: define void @t1(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t1: -; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] -; CHECK: adds r0, #15 -; CHECK: adds r1, #15 +; CHECK: movs [[INC:r[0-9]+]], #15 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1], [[INC]] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false) @@ -43,13 +42,15 @@ entry: define void @t2(i8* nocapture %C) nounwind { entry: ; CHECK-LABEL: t2: +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! +; CHECK: movs [[INC:r[0-9]+]], #32 +; CHECK: add.w r3, r0, #16 +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] ; CHECK: movw [[REG2:r[0-9]+]], #16716 ; CHECK: movt [[REG2:r[0-9]+]], #72 -; CHECK: str [[REG2]], [r0, #32] -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! 
+; CHECK: str [[REG2]], [r0] ; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3] tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) ret void } diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index f6f8d5623509..b86874692aca 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -13,10 +13,10 @@ entry: define void @t2() nounwind ssp { entry: ; CHECK-LABEL: t2: -; CHECK: add.w r1, r0, #10 ; CHECK: vmov.i32 {{q[0-9]+}}, #0x0 -; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] -; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: movs r1, #10 +; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2], r1 +; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false) diff --git a/test/CodeGen/ARM/vbits.ll b/test/CodeGen/ARM/vbits.ll index db9bc6ccdd0c..0a7f7698fa88 100644 --- a/test/CodeGen/ARM/vbits.ll +++ b/test/CodeGen/ARM/vbits.ll @@ -1,8 +1,14 @@ -; RUN: llc -mtriple=arm-eabi -mattr=+neon -mcpu=cortex-a8 %s -o - | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-eabi -mattr=+neon -mcpu=cortex-a8 | FileCheck %s define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: v_andi8: -;CHECK: vand +; CHECK-LABEL: v_andi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = and <8 x i8> %tmp1, %tmp2 @@ -10,8 +16,13 @@ define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) 
nounwind { } define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: v_andi16: -;CHECK: vand +; CHECK-LABEL: v_andi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = and <4 x i16> %tmp1, %tmp2 @@ -19,8 +30,13 @@ define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: v_andi32: -;CHECK: vand +; CHECK-LABEL: v_andi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = and <2 x i32> %tmp1, %tmp2 @@ -28,8 +44,13 @@ define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK-LABEL: v_andi64: -;CHECK: vand +; CHECK-LABEL: v_andi64: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B %tmp3 = and <1 x i64> %tmp1, %tmp2 @@ -37,8 +58,14 @@ define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind { } define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: v_andQi8: -;CHECK: vand +; CHECK-LABEL: v_andQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = and <16 x i8> %tmp1, %tmp2 @@ -46,8 +73,14 @@ define <16 x 
i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { } define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: v_andQi16: -;CHECK: vand +; CHECK-LABEL: v_andQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = and <8 x i16> %tmp1, %tmp2 @@ -55,8 +88,14 @@ define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: v_andQi32: -;CHECK: vand +; CHECK-LABEL: v_andQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = and <4 x i32> %tmp1, %tmp2 @@ -64,8 +103,14 @@ define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK-LABEL: v_andQi64: -;CHECK: vand +; CHECK-LABEL: v_andQi64: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B %tmp3 = and <2 x i64> %tmp1, %tmp2 @@ -73,8 +118,13 @@ define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { } define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: v_bici8: -;CHECK: vbic +; CHECK-LABEL: v_bici8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vbic d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: 
bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -83,8 +133,13 @@ define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind { } define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: v_bici16: -;CHECK: vbic +; CHECK-LABEL: v_bici16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vbic d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > @@ -93,8 +148,13 @@ define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: v_bici32: -;CHECK: vbic +; CHECK-LABEL: v_bici32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vbic d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > @@ -103,8 +163,13 @@ define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK-LABEL: v_bici64: -;CHECK: vbic +; CHECK-LABEL: v_bici64: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vbic d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > @@ -113,8 +178,14 @@ define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind { } define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: v_bicQi8: -;CHECK: vbic +; CHECK-LABEL: v_bicQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; 
CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vbic q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -123,8 +194,14 @@ define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { } define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: v_bicQi16: -;CHECK: vbic +; CHECK-LABEL: v_bicQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vbic q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > @@ -133,8 +210,14 @@ define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: v_bicQi32: -;CHECK: vbic +; CHECK-LABEL: v_bicQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vbic q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > @@ -143,8 +226,14 @@ define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK-LABEL: v_bicQi64: -;CHECK: vbic +; CHECK-LABEL: v_bicQi64: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vbic q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; 
CHECK-NEXT: bx lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > @@ -153,8 +242,13 @@ define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { } define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: v_eori8: -;CHECK: veor +; CHECK-LABEL: v_eori8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: veor d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = xor <8 x i8> %tmp1, %tmp2 @@ -162,8 +256,13 @@ define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind { } define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: v_eori16: -;CHECK: veor +; CHECK-LABEL: v_eori16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: veor d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = xor <4 x i16> %tmp1, %tmp2 @@ -171,8 +270,13 @@ define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: v_eori32: -;CHECK: veor +; CHECK-LABEL: v_eori32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: veor d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = xor <2 x i32> %tmp1, %tmp2 @@ -180,8 +284,13 @@ define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK-LABEL: v_eori64: -;CHECK: veor +; CHECK-LABEL: v_eori64: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: veor d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; 
CHECK-NEXT: bx lr %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B %tmp3 = xor <1 x i64> %tmp1, %tmp2 @@ -189,8 +298,14 @@ define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind { } define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: v_eorQi8: -;CHECK: veor +; CHECK-LABEL: v_eorQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: veor q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = xor <16 x i8> %tmp1, %tmp2 @@ -198,8 +313,14 @@ define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { } define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: v_eorQi16: -;CHECK: veor +; CHECK-LABEL: v_eorQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: veor q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = xor <8 x i16> %tmp1, %tmp2 @@ -207,8 +328,14 @@ define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: v_eorQi32: -;CHECK: veor +; CHECK-LABEL: v_eorQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: veor q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = xor <4 x i32> %tmp1, %tmp2 @@ -216,8 +343,14 @@ define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK-LABEL: v_eorQi64: -;CHECK: veor +; CHECK-LABEL: 
v_eorQi64: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: veor q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B %tmp3 = xor <2 x i64> %tmp1, %tmp2 @@ -225,72 +358,113 @@ define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { } define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind { -;CHECK-LABEL: v_mvni8: -;CHECK: vmvn +; CHECK-LABEL: v_mvni8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmvn d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > ret <8 x i8> %tmp2 } define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind { -;CHECK-LABEL: v_mvni16: -;CHECK: vmvn +; CHECK-LABEL: v_mvni16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmvn d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 > ret <4 x i16> %tmp2 } define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind { -;CHECK-LABEL: v_mvni32: -;CHECK: vmvn +; CHECK-LABEL: v_mvni32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmvn d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 > ret <2 x i32> %tmp2 } define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind { -;CHECK-LABEL: v_mvni64: -;CHECK: vmvn +; CHECK-LABEL: v_mvni64: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmvn d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = xor <1 x i64> %tmp1, < i64 -1 > ret <1 x i64> %tmp2 } define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind { -;CHECK-LABEL: v_mvnQi8: -;CHECK: vmvn +; 
CHECK-LABEL: v_mvnQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmvn q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > ret <16 x i8> %tmp2 } define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind { -;CHECK-LABEL: v_mvnQi16: -;CHECK: vmvn +; CHECK-LABEL: v_mvnQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmvn q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > ret <8 x i16> %tmp2 } define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind { -;CHECK-LABEL: v_mvnQi32: -;CHECK: vmvn +; CHECK-LABEL: v_mvnQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmvn q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 > ret <4 x i32> %tmp2 } define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind { -;CHECK-LABEL: v_mvnQi64: -;CHECK: vmvn +; CHECK-LABEL: v_mvnQi64: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmvn q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 > ret <2 x i64> %tmp2 } define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: v_orri8: -;CHECK: vorr +; CHECK-LABEL: v_orri8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorr d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load 
<8 x i8>, <8 x i8>* %B %tmp3 = or <8 x i8> %tmp1, %tmp2 @@ -298,8 +472,13 @@ define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind { } define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: v_orri16: -;CHECK: vorr +; CHECK-LABEL: v_orri16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorr d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = or <4 x i16> %tmp1, %tmp2 @@ -307,8 +486,13 @@ define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: v_orri32: -;CHECK: vorr +; CHECK-LABEL: v_orri32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorr d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = or <2 x i32> %tmp1, %tmp2 @@ -316,8 +500,13 @@ define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK-LABEL: v_orri64: -;CHECK: vorr +; CHECK-LABEL: v_orri64: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorr d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B %tmp3 = or <1 x i64> %tmp1, %tmp2 @@ -325,8 +514,14 @@ define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind { } define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: v_orrQi8: -;CHECK: vorr +; CHECK-LABEL: v_orrQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorr q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load 
<16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = or <16 x i8> %tmp1, %tmp2 @@ -334,8 +529,14 @@ define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { } define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: v_orrQi16: -;CHECK: vorr +; CHECK-LABEL: v_orrQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorr q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = or <8 x i16> %tmp1, %tmp2 @@ -343,8 +544,14 @@ define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: v_orrQi32: -;CHECK: vorr +; CHECK-LABEL: v_orrQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorr q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = or <4 x i32> %tmp1, %tmp2 @@ -352,8 +559,14 @@ define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK-LABEL: v_orrQi64: -;CHECK: vorr +; CHECK-LABEL: v_orrQi64: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorr q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B %tmp3 = or <2 x i64> %tmp1, %tmp2 @@ -361,8 +574,13 @@ define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { } define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: v_orni8: -;CHECK: vorn +; CHECK-LABEL: v_orni8: +; CHECK: @ BB#0: +; 
CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorn d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -371,8 +589,13 @@ define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { } define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: v_orni16: -;CHECK: vorn +; CHECK-LABEL: v_orni16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorn d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > @@ -381,8 +604,13 @@ define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: v_orni32: -;CHECK: vorn +; CHECK-LABEL: v_orni32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorn d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > @@ -391,8 +619,13 @@ define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK-LABEL: v_orni64: -;CHECK: vorn +; CHECK-LABEL: v_orni64: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vorn d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > @@ -401,8 +634,14 @@ define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind { } define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) 
nounwind { -;CHECK-LABEL: v_ornQi8: -;CHECK: vorn +; CHECK-LABEL: v_ornQi8: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorn q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -411,8 +650,14 @@ define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { } define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: v_ornQi16: -;CHECK: vorn +; CHECK-LABEL: v_ornQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorn q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > @@ -421,8 +666,14 @@ define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: v_ornQi32: -;CHECK: vorn +; CHECK-LABEL: v_ornQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorn q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > @@ -431,8 +682,14 @@ define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK-LABEL: v_ornQi64: -;CHECK: vorn +; CHECK-LABEL: v_ornQi64: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; 
CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vorn q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > @@ -441,8 +698,13 @@ define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { } define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: vtsti8: -;CHECK: vtst.8 +; CHECK-LABEL: vtsti8: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtst.8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = and <8 x i8> %tmp1, %tmp2 @@ -452,8 +714,13 @@ define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind { } define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: vtsti16: -;CHECK: vtst.16 +; CHECK-LABEL: vtsti16: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtst.16 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = and <4 x i16> %tmp1, %tmp2 @@ -463,8 +730,13 @@ define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: vtsti32: -;CHECK: vtst.32 +; CHECK-LABEL: vtsti32: +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vtst.32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = and <2 x i32> %tmp1, %tmp2 @@ -474,8 +746,14 @@ define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK-LABEL: vtstQi8: -;CHECK: vtst.8 +; CHECK-LABEL: vtstQi8: +; CHECK: 
@ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtst.8 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B %tmp3 = and <16 x i8> %tmp1, %tmp2 @@ -485,8 +763,14 @@ define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { } define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: vtstQi16: -;CHECK: vtst.16 +; CHECK-LABEL: vtstQi16: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtst.16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = and <8 x i16> %tmp1, %tmp2 @@ -496,8 +780,14 @@ define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: vtstQi32: -;CHECK: vtst.32 +; CHECK-LABEL: vtstQi32: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vtst.32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = and <4 x i32> %tmp1, %tmp2 @@ -508,19 +798,24 @@ define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { define <8 x i8> @v_orrimm(<8 x i8>* %A) nounwind { ; CHECK-LABEL: v_orrimm: -; CHECK-NOT: vmov -; CHECK-NOT: vmvn -; CHECK: vorr +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vorr.i32 d16, #0x1000000 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp3 = or <8 x i8> %tmp1, ret <8 x i8> %tmp3 } define <16 x i8> @v_orrimmQ(<16 x i8>* %A) nounwind { -; CHECK: v_orrimmQ -; CHECK-NOT: vmov -; CHECK-NOT: vmvn -; CHECK: vorr +; 
CHECK-LABEL: v_orrimmQ: +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vorr.i32 q8, #0x1000000 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp3 = or <16 x i8> %tmp1, ret <16 x i8> %tmp3 @@ -528,9 +823,11 @@ define <16 x i8> @v_orrimmQ(<16 x i8>* %A) nounwind { define <8 x i8> @v_bicimm(<8 x i8>* %A) nounwind { ; CHECK-LABEL: v_bicimm: -; CHECK-NOT: vmov -; CHECK-NOT: vmvn -; CHECK: vbic +; CHECK: @ BB#0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vbic.i32 d16, #0xff000000 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > ret <8 x i8> %tmp3 @@ -538,10 +835,29 @@ define <8 x i8> @v_bicimm(<8 x i8>* %A) nounwind { define <16 x i8> @v_bicimmQ(<16 x i8>* %A) nounwind { ; CHECK-LABEL: v_bicimmQ: -; CHECK-NOT: vmov -; CHECK-NOT: vmvn -; CHECK: vbic +; CHECK: @ BB#0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vbic.i32 q8, #0xff000000 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > ret <16 x i8> %tmp3 } + +define <4 x i32> @hidden_not_v4i32(<4 x i32> %x) nounwind { +; CHECK-LABEL: hidden_not_v4i32: +; CHECK: @ BB#0: +; CHECK-NEXT: vmov d19, r2, r3 +; CHECK-NEXT: vmov.i32 q8, #0x6 +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vbic q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr + %xor = xor <4 x i32> %x, + %and = and <4 x i32> %xor, + ret <4 x i32> %and +} + diff --git a/test/CodeGen/ARM/vector-load.ll b/test/CodeGen/ARM/vector-load.ll index ed734723a86d..4f7ebc938d4c 100644 --- a/test/CodeGen/ARM/vector-load.ll +++ b/test/CodeGen/ARM/vector-load.ll @@ -253,11 +253,22 @@ 
define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) { } ; CHECK-LABEL: test_silly_load: -; CHECK: ldr {{r[0-9]+}}, [r0, #24] -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]! -; CHECK: vldr d{{[0-9]+}}, [r0] +; CHECK: vldr d{{[0-9]+}}, [r0, #16] +; CHECK: movs r1, #24 +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1 +; CHECK: ldr {{r[0-9]+}}, [r0] define void @test_silly_load(<28 x i8>* %addr) { load volatile <28 x i8>, <28 x i8>* %addr ret void } + +define <4 x i32>* @test_vld1_immoffset(<4 x i32>* %ptr.in, <4 x i32>* %ptr.out) { +; CHECK-LABEL: test_vld1_immoffset: +; CHECK: movs [[INC:r[0-9]+]], #32 +; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0], [[INC]] + %val = load <4 x i32>, <4 x i32>* %ptr.in + store <4 x i32> %val, <4 x i32>* %ptr.out + %next = getelementptr <4 x i32>, <4 x i32>* %ptr.in, i32 2 + ret <4 x i32>* %next +} diff --git a/test/CodeGen/ARM/vector-store.ll b/test/CodeGen/ARM/vector-store.ll index 161bbf1d0fde..e8c1a78a9113 100644 --- a/test/CodeGen/ARM/vector-store.ll +++ b/test/CodeGen/ARM/vector-store.ll @@ -256,3 +256,13 @@ define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val) store <4 x i8>* %inc, <4 x i8>** %ptr ret void } + +define <4 x i32>* @test_vst1_1reg(<4 x i32>* %ptr.in, <4 x i32>* %ptr.out) { +; CHECK-LABEL: test_vst1_1reg: +; CHECK: movs [[INC:r[0-9]+]], #32 +; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r1], [[INC]] + %val = load <4 x i32>, <4 x i32>* %ptr.in + store <4 x i32> %val, <4 x i32>* %ptr.out + %next = getelementptr <4 x i32>, <4 x i32>* %ptr.out, i32 2 + ret <4 x i32>* %next +} diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll index c6d5747f3509..71ca0f791524 100644 --- a/test/CodeGen/ARM/vlddup.ll +++ b/test/CodeGen/ARM/vlddup.ll @@ -310,6 +310,23 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind { ret <4 x i16> %tmp5 } +define <4 x i16> @vld2dupi16_odd_update(i16** %ptr) nounwind { +;CHECK-LABEL: vld2dupi16_odd_update: +;CHECK: 
mov [[INC:r[0-9]+]], #6 +;CHECK: vld2.16 {d16[], d17[]}, [r1], [[INC]] + %A = load i16*, i16** %ptr + %A2 = bitcast i16* %A to i8* + %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1 + %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer + %tmp5 = add <4 x i16> %tmp2, %tmp4 + %tmp6 = getelementptr i16, i16* %A, i32 3 + store i16* %tmp6, i16** %ptr + ret <4 x i16> %tmp5 +} + define <2 x i32> @vld2dupi32(i8* %A) nounwind { ;CHECK-LABEL: vld2dupi32: ;Check the alignment value. Max for this instruction is 64 bits: diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll index 2c14bc2d8f4e..866641f3fbbd 100644 --- a/test/CodeGen/ARM/vldlane.ll +++ b/test/CodeGen/ARM/vldlane.ll @@ -150,6 +150,22 @@ define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind { ret <2 x i32> %tmp5 } +define <2 x i32> @vld2lanei32_odd_update(i32** %ptr, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vld2lanei32_odd_update: +;CHECK: mov [[INC:r[0-9]+]], #12 +;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}], [[INC]] + %A = load i32*, i32** %ptr + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>, <2 x i32>* %B + %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) + %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0 + %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1 + %tmp5 = add <2 x i32> %tmp3, %tmp4 + %tmp6 = getelementptr i32, i32* %A, i32 3 + store i32* %tmp6, i32** %ptr + ret <2 x i32> %tmp5 +} + define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK-LABEL: vld2lanef: ;CHECK: vld2.32 diff --git a/test/CodeGen/ARM/vtbl.ll 
b/test/CodeGen/ARM/vtbl.ll index e4dd572a41b4..2e0718877e96 100644 --- a/test/CodeGen/ARM/vtbl.ll +++ b/test/CodeGen/ARM/vtbl.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s +; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - -verify-machineinstrs | FileCheck %s %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> } %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } diff --git a/test/CodeGen/AVR/alloca.ll b/test/CodeGen/AVR/alloca.ll index 579573c0a133..37c0e62b55fd 100644 --- a/test/CodeGen/AVR/alloca.ll +++ b/test/CodeGen/AVR/alloca.ll @@ -45,14 +45,14 @@ entry: define i16 @alloca_write(i16 %x) { entry: ; CHECK-LABEL: alloca_write: +; Small offset here +; CHECK: std Y+23, {{.*}} +; CHECK: std Y+24, {{.*}} ; Big offset here ; CHECK: adiw r28, 57 ; CHECK: std Y+62, {{.*}} ; CHECK: std Y+63, {{.*}} ; CHECK: sbiw r28, 57 -; Small offset here -; CHECK: std Y+23, {{.*}} -; CHECK: std Y+24, {{.*}} %p = alloca [15 x i16] %k = alloca [14 x i16] %arrayidx = getelementptr inbounds [15 x i16], [15 x i16]* %p, i16 0, i16 45 diff --git a/test/CodeGen/AVR/call.ll b/test/CodeGen/AVR/call.ll index 58bffd3a6787..bc6cb198a9e5 100644 --- a/test/CodeGen/AVR/call.ll +++ b/test/CodeGen/AVR/call.ll @@ -30,9 +30,9 @@ define i8 @calli8_reg() { define i8 @calli8_stack() { ; CHECK-LABEL: calli8_stack: -; CHECK: ldi [[REG1:r[0-9]+]], 11 +; CHECK: ldi [[REG1:r[0-9]+]], 10 ; CHECK: push [[REG1]] -; CHECK: ldi [[REG1]], 10 +; CHECK: ldi [[REG1]], 11 ; CHECK: push [[REG1]] ; CHECK: call foo8_3 %result1 = call i8 @foo8_3(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11) @@ -52,14 +52,14 @@ define i16 @calli16_reg() { define i16 @calli16_stack() { ; CHECK-LABEL: calli16_stack: -; CHECK: ldi [[REG1:r[0-9]+]], 10 -; CHECK: ldi [[REG2:r[0-9]+]], 2 -; CHECK: push [[REG2]] -; CHECK: push [[REG1]] ; CHECK: ldi [[REG1:r[0-9]+]], 9 ; CHECK: ldi [[REG2:r[0-9]+]], 2 ; CHECK: push [[REG2]] ; CHECK: push [[REG1]] +; CHECK: ldi 
[[REG1:r[0-9]+]], 10 +; CHECK: ldi [[REG2:r[0-9]+]], 2 +; CHECK: push [[REG2]] +; CHECK: push [[REG1]] ; CHECK: call foo16_2 %result1 = call i16 @foo16_2(i16 512, i16 513, i16 514, i16 515, i16 516, i16 517, i16 518, i16 519, i16 520, i16 521, i16 522) ret i16 %result1 @@ -82,14 +82,14 @@ define i32 @calli32_reg() { define i32 @calli32_stack() { ; CHECK-LABEL: calli32_stack: -; CHECK: ldi [[REG1:r[0-9]+]], 15 -; CHECK: ldi [[REG2:r[0-9]+]], 2 -; CHECK: push [[REG2]] -; CHECK: push [[REG1]] ; CHECK: ldi [[REG1:r[0-9]+]], 64 ; CHECK: ldi [[REG2:r[0-9]+]], 66 ; CHECK: push [[REG2]] ; CHECK: push [[REG1]] +; CHECK: ldi [[REG1:r[0-9]+]], 15 +; CHECK: ldi [[REG2:r[0-9]+]], 2 +; CHECK: push [[REG2]] +; CHECK: push [[REG1]] ; CHECK: call foo32_2 %result1 = call i32 @foo32_2(i32 1, i32 2, i32 3, i32 4, i32 34554432) ret i32 %result1 @@ -112,14 +112,15 @@ define i64 @calli64_reg() { define i64 @calli64_stack() { ; CHECK-LABEL: calli64_stack: -; CHECK: ldi [[REG1:r[0-9]+]], 31 -; CHECK: ldi [[REG2:r[0-9]+]], 242 -; CHECK: push [[REG2]] -; CHECK: push [[REG1]] + ; CHECK: ldi [[REG1:r[0-9]+]], 76 ; CHECK: ldi [[REG2:r[0-9]+]], 73 ; CHECK: push [[REG2]] ; CHECK: push [[REG1]] +; CHECK: ldi [[REG1:r[0-9]+]], 31 +; CHECK: ldi [[REG2:r[0-9]+]], 242 +; CHECK: push [[REG2]] +; CHECK: push [[REG1]] ; CHECK: ldi [[REG1:r[0-9]+]], 155 ; CHECK: ldi [[REG2:r[0-9]+]], 88 ; CHECK: push [[REG2]] diff --git a/test/CodeGen/AVR/directmem.ll b/test/CodeGen/AVR/directmem.ll index a97e712ed625..032263a9d657 100644 --- a/test/CodeGen/AVR/directmem.ll +++ b/test/CodeGen/AVR/directmem.ll @@ -33,10 +33,10 @@ define i8 @global8_load() { define void @array8_store() { ; CHECK-LABEL: array8_store: -; CHECK: ldi [[REG1:r[0-9]+]], 1 -; CHECK: sts char.array, [[REG1]] ; CHECK: ldi [[REG2:r[0-9]+]], 2 ; CHECK: sts char.array+1, [[REG2]] +; CHECK: ldi [[REG1:r[0-9]+]], 1 +; CHECK: sts char.array, [[REG1]] ; CHECK: ldi [[REG:r[0-9]+]], 3 ; CHECK: sts char.array+2, [[REG]] store i8 1, i8* getelementptr inbounds 
([3 x i8], [3 x i8]* @char.array, i32 0, i64 0) @@ -83,14 +83,18 @@ define i16 @global16_load() { define void @array16_store() { ; CHECK-LABEL: array16_store: -; CHECK: ldi [[REG1:r[0-9]+]], 187 -; CHECK: ldi [[REG2:r[0-9]+]], 170 -; CHECK: sts int.array+1, [[REG2]] -; CHECK: sts int.array, [[REG1]] + ; CHECK: ldi [[REG1:r[0-9]+]], 204 ; CHECK: ldi [[REG2:r[0-9]+]], 170 ; CHECK: sts int.array+3, [[REG2]] ; CHECK: sts int.array+2, [[REG1]] + +; CHECK: ldi [[REG1:r[0-9]+]], 187 +; CHECK: ldi [[REG2:r[0-9]+]], 170 +; CHECK: sts int.array+1, [[REG2]] +; CHECK: sts int.array, [[REG1]] + + ; CHECK: ldi [[REG1:r[0-9]+]], 221 ; CHECK: ldi [[REG2:r[0-9]+]], 170 ; CHECK: sts int.array+5, [[REG2]] @@ -148,14 +152,6 @@ define i32 @global32_load() { define void @array32_store() { ; CHECK-LABEL: array32_store: -; CHECK: ldi [[REG1:r[0-9]+]], 27 -; CHECK: ldi [[REG2:r[0-9]+]], 172 -; CHECK: sts long.array+3, [[REG2]] -; CHECK: sts long.array+2, [[REG1]] -; CHECK: ldi [[REG1:r[0-9]+]], 68 -; CHECK: ldi [[REG2:r[0-9]+]], 13 -; CHECK: sts long.array+1, [[REG2]] -; CHECK: sts long.array, [[REG1]] ; CHECK: ldi [[REG1:r[0-9]+]], 102 ; CHECK: ldi [[REG2:r[0-9]+]], 85 ; CHECK: sts long.array+7, [[REG2]] @@ -164,6 +160,14 @@ define void @array32_store() { ; CHECK: ldi [[REG2:r[0-9]+]], 119 ; CHECK: sts long.array+5, [[REG2]] ; CHECK: sts long.array+4, [[REG1]] +; CHECK: ldi [[REG1:r[0-9]+]], 27 +; CHECK: ldi [[REG2:r[0-9]+]], 172 +; CHECK: sts long.array+3, [[REG2]] +; CHECK: sts long.array+2, [[REG1]] +; CHECK: ldi [[REG1:r[0-9]+]], 68 +; CHECK: ldi [[REG2:r[0-9]+]], 13 +; CHECK: sts long.array+1, [[REG2]] +; CHECK: sts long.array, [[REG1]] ; CHECK: ldi [[REG1:r[0-9]+]], 170 ; CHECK: ldi [[REG2:r[0-9]+]], 153 ; CHECK: sts long.array+11, [[REG2]] diff --git a/test/CodeGen/AVR/inline-asm/multibyte.ll b/test/CodeGen/AVR/inline-asm/multibyte.ll deleted file mode 100644 index a7c8f6e75f0f..000000000000 --- a/test/CodeGen/AVR/inline-asm/multibyte.ll +++ /dev/null @@ -1,135 +0,0 @@ -; RUN: llc 
< %s -march=avr -no-integrated-as | FileCheck %s -; XFAIL: * - -; Multibyte references - -; CHECK-LABEL: multibyte_i16 -define void @multibyte_i16(i16 %a) { -entry: -; CHECK: instr r24 r25 - call void asm sideeffect "instr ${0:A} ${0:B}", "r"(i16 %a) -; CHECK: instr r25 r24 - call void asm sideeffect "instr ${0:B} ${0:A}", "r"(i16 %a) - ret void -} - -; CHECK-LABEL: multibyte_i32 -define void @multibyte_i32(i32 %a) { -entry: -; CHECK: instr r22 r23 r24 r25 - call void asm sideeffect "instr ${0:A} ${0:B} ${0:C} ${0:D}", "r"(i32 %a) -; CHECK: instr r25 r24 r23 r22 - call void asm sideeffect "instr ${0:D} ${0:C} ${0:B} ${0:A}", "r"(i32 %a) - ret void -} - -; CHECK-LABEL: multibyte_alternative_name -define void @multibyte_alternative_name(i16* %p) { -entry: -; CHECK: instr Z - call void asm sideeffect "instr ${0:a}", "e" (i16* %p) - ret void -} - -; CHECK-LABEL: multibyte_a_i32 -define void @multibyte_a_i32() { -entry: - %a = alloca i32 - %0 = load i32, i32* %a -; CHECK: instr r20 r21 r22 r23 - call void asm sideeffect "instr ${0:A} ${0:B} ${0:C} ${0:D}", "a"(i32 %0) - ret void -} - -@c = internal global i32 0 - -; CHECK-LABEL: multibyte_b_i32 -define void @multibyte_b_i32() { -entry: - %0 = load i32, i32* @c -; CHECK: instr r28 r29 r30 r31 - call void asm sideeffect "instr ${0:A} ${0:B} ${0:C} ${0:D}", "b"(i32 %0) - ret void -} - -; CHECK-LABEL: multibyte_d_i32 -define void @multibyte_d_i32() { -entry: - %a = alloca i32 - %0 = load i32, i32* %a -; CHECK: instr r18 r19 r24 r25 - call void asm sideeffect "instr ${0:A} ${0:B} ${0:C} ${0:D}", "d"(i32 %0) - ret void -} - -; CHECK-LABEL: multibyte_e_i32 -define void @multibyte_e_i32() { -entry: - %a = alloca i32 - %0 = load i32, i32* %a -; CHECK: instr r26 r27 r30 r31 - call void asm sideeffect "instr ${0:A} ${0:B} ${0:C} ${0:D}", "e"(i32 %0) - ret void -} - -; CHECK-LABEL: multibyte_l_i32 -define void @multibyte_l_i32() { -entry: - %a = alloca i32 - %0 = load i32, i32* %a -; CHECK: instr r12 r13 r14 r15 - call void asm 
sideeffect "instr ${0:A} ${0:B} ${0:C} ${0:D}", "l"(i32 %0) - ret void -} - -; CHECK-LABEL: multibyte_a_i16 -define void @multibyte_a_i16() { -entry: - %a = alloca i16 - %0 = load i16, i16* %a -; CHECK: instr r22 r23 - call void asm sideeffect "instr ${0:A} ${0:B}", "a"(i16 %0) - ret void -} - -; CHECK-LABEL: multibyte_b_i16 -define void @multibyte_b_i16() { -entry: - %a = alloca i16 - %0 = load i16, i16* %a -; CHECK: instr r30 r31 - call void asm sideeffect "instr ${0:A} ${0:B}", "b"(i16 %0) - ret void -} - -; CHECK-LABEL: multibyte_d_i16 -define void @multibyte_d_i16() { -entry: - %a = alloca i16 - %0 = load i16, i16* %a -; CHECK: instr r24 r25 - call void asm sideeffect "instr ${0:A} ${0:B}", "d"(i16 %0) - ret void -} - -; CHECK-LABEL: multibyte_e_i16 -define void @multibyte_e_i16() { -entry: - %a = alloca i16 - %0 = load i16, i16* %a -; CHECK: instr r30 r31 - call void asm sideeffect "instr ${0:A} ${0:B}", "e"(i16 %0) - ret void -} - -; CHECK-LABEL: multibyte_l_i16 -define void @multibyte_l_i16() { -entry: - %a = alloca i16 - %0 = load i16, i16* %a -; CHECK: instr r14 r15 - call void asm sideeffect "instr ${0:A} ${0:B}", "l"(i16 %0) - ret void -} - - diff --git a/test/CodeGen/AVR/varargs.ll b/test/CodeGen/AVR/varargs.ll index b35ce4c0f7ae..4959f2d880c8 100644 --- a/test/CodeGen/AVR/varargs.ll +++ b/test/CodeGen/AVR/varargs.ll @@ -40,14 +40,14 @@ define i16 @varargs2(i8* nocapture %x, ...) { declare void @var1223(i16, ...) 
define void @varargcall() { ; CHECK-LABEL: varargcall: -; CHECK: ldi [[REG1:r[0-9]+]], 191 -; CHECK: ldi [[REG2:r[0-9]+]], 223 -; CHECK: push [[REG2]] -; CHECK: push [[REG1]] ; CHECK: ldi [[REG1:r[0-9]+]], 189 ; CHECK: ldi [[REG2:r[0-9]+]], 205 ; CHECK: push [[REG2]] ; CHECK: push [[REG1]] +; CHECK: ldi [[REG1:r[0-9]+]], 191 +; CHECK: ldi [[REG2:r[0-9]+]], 223 +; CHECK: push [[REG2]] +; CHECK: push [[REG1]] ; CHECK: ldi [[REG1:r[0-9]+]], 205 ; CHECK: ldi [[REG2:r[0-9]+]], 171 ; CHECK: push [[REG2]] diff --git a/test/CodeGen/Hexagon/addrmode-globoff.mir b/test/CodeGen/Hexagon/addrmode-globoff.mir new file mode 100644 index 000000000000..fb22959751ac --- /dev/null +++ b/test/CodeGen/Hexagon/addrmode-globoff.mir @@ -0,0 +1,25 @@ +# RUN: llc -march=hexagon -run-pass amode-opt %s -o - | FileCheck %s + +--- | + @g0 = external global [16 x i16], align 8 + define void @foo() { + ret void + } +... + +--- +name: foo +tracksRegLiveness: true + +body: | + bb.0: + liveins: %r0 + + ; Make sure that the offset in @g0 is 8. + ; CHECK: S4_storerh_ur killed %r0, 2, @g0 + 8, %r0 + + %r1 = A2_tfrsi @g0+4 + %r2 = S2_addasl_rrri %r1, %r0, 2 + S2_storerh_io %r2, 4, %r0 +... 
+ diff --git a/test/CodeGen/Mips/msa/shift_constant_pool.ll b/test/CodeGen/Mips/msa/shift_constant_pool.ll new file mode 100644 index 000000000000..73da33361bfa --- /dev/null +++ b/test/CodeGen/Mips/msa/shift_constant_pool.ll @@ -0,0 +1,171 @@ +; Test whether the following functions, with vectors featuring negative or values larger than the element +; bit size have their results of operations generated correctly when placed into constant pools + +; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64 %s +; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32 %s +; RUN: llc -march=mips64el -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64 %s +; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32 %s + +@llvm_mips_bclr_w_test_const_vec_res = global <4 x i32> zeroinitializer, align 16 + +define void @llvm_mips_bclr_w_test_const_vec() nounwind { +entry: + %0 = tail call <4 x i32> @llvm.mips.bclr.w(<4 x i32> , <4 x i32> ) + store <4 x i32> %0, <4 x i32>* @llvm_mips_bclr_w_test_const_vec_res + ret void +} + +declare <4 x i32> @llvm.mips.bclr.w(<4 x i32>, <4 x i32>) nounwind + +; MIPS32: [[LABEL:\$CPI[0-9]+_[0-9]+]]: +; MIPS64: [[LABEL:\.LCPI[0-9]+_[0-9]+]]: +; ALL: .4byte 1 # 0x1 +; ALL: .4byte 1 # 0x1 +; ALL: .4byte 3 # 0x3 +; ALL: .4byte 3 # 0x3 +; ALL-LABEL: llvm_mips_bclr_w_test_const_vec: +; MIPS32: lw $[[R2:[0-9]+]], %got([[LABEL]])($[[R1:[0-9]+]]) +; MIPS32: addiu $[[R2]], $[[R2]], %lo([[LABEL]]) +; MIPS32: lw $[[R3:[0-9]+]], %got(llvm_mips_bclr_w_test_const_vec_res)($[[R1]]) +; MIPS64: ld $[[R2:[0-9]+]], %got_page([[LABEL]])($[[R1:[0-9]+]]) +; MIPS64: daddiu $[[R2]], $[[R2]], %got_ofst([[LABEL]]) +; MIPS64: ld $[[R3:[0-9]+]], %got_disp(llvm_mips_bclr_w_test_const_vec_res)($[[R1]]) +; ALL: ld.w $w0, 0($[[R2]]) +; ALL: st.w $w0, 0($[[R3]]) + + 
+@llvm_mips_bneg_w_test_const_vec_res = global <4 x i32> zeroinitializer, align 16 + +define void @llvm_mips_bneg_w_test_const_vec() nounwind { +entry: + %0 = tail call <4 x i32> @llvm.mips.bneg.w(<4 x i32> , <4 x i32> ) + store <4 x i32> %0, <4 x i32>* @llvm_mips_bneg_w_test_const_vec_res + ret void +} + +declare <4 x i32> @llvm.mips.bneg.w(<4 x i32>, <4 x i32>) nounwind + +; MIPS32: [[LABEL:\$CPI[0-9]+_[0-9]+]]: +; MIPS64: [[LABEL:\.LCPI[0-9]+_[0-9]+]]: +; ALL: .4byte 1 # 0x1 +; ALL: .4byte 1 # 0x1 +; ALL: .4byte 3 # 0x3 +; ALL: .4byte 3 # 0x3 +; ALL-LABEL: llvm_mips_bneg_w_test_const_vec: +; MIPS32: lw $[[R2:[0-9]+]], %got([[LABEL]])($[[R1:[0-9]+]]) +; MIPS32: addiu $[[R2]], $[[R2]], %lo([[LABEL]]) +; MIPS32: lw $[[R3:[0-9]+]], %got(llvm_mips_bneg_w_test_const_vec_res)($[[R1]]) +; MIPS64: ld $[[R2:[0-9]+]], %got_page([[LABEL]])($[[R1:[0-9]+]]) +; MIPS64: daddiu $[[R2]], $[[R2]], %got_ofst([[LABEL]]) +; MIPS64: ld $[[R3:[0-9]+]], %got_disp(llvm_mips_bneg_w_test_const_vec_res)($[[R1]]) +; ALL: ld.w $w0, 0($[[R2]]) +; ALL: st.w $w0, 0($[[R3]]) + + +@llvm_mips_bset_w_test_const_vec_res = global <4 x i32> zeroinitializer, align 16 + +define void @llvm_mips_bset_w_test_const_vec() nounwind { +entry: + %0 = tail call <4 x i32> @llvm.mips.bset.w(<4 x i32> , <4 x i32> ) + store <4 x i32> %0, <4 x i32>* @llvm_mips_bset_w_test_const_vec_res + ret void +} + +declare <4 x i32> @llvm.mips.bset.w(<4 x i32>, <4 x i32>) nounwind + +; MIPS32: [[LABEL:\$CPI[0-9]+_[0-9]+]]: +; MIPS64: [[LABEL:\.LCPI[0-9]+_[0-9]+]]: +; ALL: .4byte 2147483648 # 0x80000000 +; ALL: .4byte 2147483648 # 0x80000000 +; ALL: .4byte 4 # 0x4 +; ALL: .4byte 4 # 0x4 +; ALL-LABEL: llvm_mips_bset_w_test_const_vec: +; MIPS32: lw $[[R2:[0-9]+]], %got([[LABEL]])($[[R1:[0-9]+]]) +; MIPS32: addiu $[[R2]], $[[R2]], %lo([[LABEL]]) +; MIPS32: lw $[[R3:[0-9]+]], %got(llvm_mips_bset_w_test_const_vec_res)($[[R1]]) +; MIPS64: ld $[[R2:[0-9]+]], %got_page([[LABEL]])($[[R1:[0-9]+]]) +; MIPS64: daddiu $[[R2]], $[[R2]], 
%got_ofst([[LABEL]]) +; MIPS64: ld $[[R3:[0-9]+]], %got_disp(llvm_mips_bset_w_test_const_vec_res)($[[R1]]) +; ALL: ld.w $w0, 0($[[R2]]) +; ALL: st.w $w0, 0($[[R3]]) + +@llvm_mips_sll_w_test_const_vec_res = global <4 x i32> zeroinitializer, align 16 + +define void @llvm_mips_sll_w_test_const_vec() nounwind { +entry: + %0 = tail call <4 x i32> @llvm.mips.sll.w(<4 x i32> , <4 x i32> ) + store <4 x i32> %0, <4 x i32>* @llvm_mips_sll_w_test_const_vec_res + ret void +} + +declare <4 x i32> @llvm.mips.sll.w(<4 x i32>, <4 x i32>) nounwind + +; MIPS32: [[LABEL:\$CPI[0-9]+_[0-9]+]]: +; MIPS64: [[LABEL:\.LCPI[0-9]+_[0-9]+]]: +; ALL: .4byte 2147483648 # 0x80000000 +; ALL: .4byte 2147483648 # 0x80000000 +; ALL: .4byte 4 # 0x4 +; ALL: .4byte 4 # 0x4 +; ALL-LABEL: llvm_mips_sll_w_test_const_vec: +; MIPS32: lw $[[R2:[0-9]+]], %got([[LABEL]])($[[R1:[0-9]+]]) +; MIPS32: addiu $[[R2]], $[[R2]], %lo([[LABEL]]) +; MIPS32: lw $[[R3:[0-9]+]], %got(llvm_mips_sll_w_test_const_vec_res)($[[R1]]) +; MIPS64: ld $[[R2:[0-9]+]], %got_page([[LABEL]])($[[R1:[0-9]+]]) +; MIPS64: daddiu $[[R2]], $[[R2]], %got_ofst([[LABEL]]) +; MIPS64: ld $[[R3:[0-9]+]], %got_disp(llvm_mips_sll_w_test_const_vec_res)($[[R1]]) +; ALL: ld.w $w0, 0($[[R2]]) +; ALL: st.w $w0, 0($[[R3]]) + +@llvm_mips_sra_w_test_const_vec_res = global <4 x i32> zeroinitializer, align 16 + +define void @llvm_mips_sra_w_test_const_vec() nounwind { +entry: + %0 = tail call <4 x i32> @llvm.mips.sra.w(<4 x i32> , <4 x i32> ) + store <4 x i32> %0, <4 x i32>* @llvm_mips_sra_w_test_const_vec_res + ret void +} + +declare <4 x i32> @llvm.mips.sra.w(<4 x i32>, <4 x i32>) nounwind + +; MIPS32: [[LABEL:\$CPI[0-9]+_[0-9]+]]: +; MIPS64: [[LABEL:\.LCPI[0-9]+_[0-9]+]]: +; ALL: .4byte 4294967292 # 0xfffffffc +; ALL: .4byte 4 # 0x4 +; ALL: .4byte 8 # 0x8 +; ALL: .4byte 8 # 0x8 +; ALL-LABEL: llvm_mips_sra_w_test_const_vec: +; MIPS32: lw $[[R2:[0-9]+]], %got([[LABEL]])($[[R1:[0-9]+]]) +; MIPS32: addiu $[[R2]], $[[R2]], %lo([[LABEL]]) +; MIPS32: lw 
$[[R3:[0-9]+]], %got(llvm_mips_sra_w_test_const_vec_res)($[[R1]]) +; MIPS64: ld $[[R2:[0-9]+]], %got_page([[LABEL]])($[[R1:[0-9]+]]) +; MIPS64: daddiu $[[R2]], $[[R2]], %got_ofst([[LABEL]]) +; MIPS64: ld $[[R3:[0-9]+]], %got_disp(llvm_mips_sra_w_test_const_vec_res)($[[R1]]) +; ALL: ld.w $w0, 0($[[R2]]) +; ALL: st.w $w0, 0($[[R3]]) + +@llvm_mips_srl_w_test_const_vec_res = global <4 x i32> zeroinitializer, align 16 + +define void @llvm_mips_srl_w_test_const_vec() nounwind { +entry: + %0 = tail call <4 x i32> @llvm.mips.srl.w(<4 x i32> , <4 x i32> ) + store <4 x i32> %0, <4 x i32>* @llvm_mips_srl_w_test_const_vec_res + ret void +} + +declare <4 x i32> @llvm.mips.srl.w(<4 x i32>, <4 x i32>) nounwind + +; MIPS32: [[LABEL:\$CPI[0-9]+_[0-9]+]]: +; MIPS64: [[LABEL:\.LCPI[0-9]+_[0-9]+]]: +; ALL: .4byte 1073741820 # 0x3ffffffc +; ALL: .4byte 4 # 0x4 +; ALL: .4byte 8 # 0x8 +; ALL: .4byte 8 # 0x8 +; ALL-LABEL: llvm_mips_srl_w_test_const_vec: +; MIPS32: lw $[[R2:[0-9]+]], %got([[LABEL]])($[[R1:[0-9]+]]) +; MIPS32: addiu $[[R2]], $[[R2]], %lo([[LABEL]]) +; MIPS32: lw $[[R3:[0-9]+]], %got(llvm_mips_srl_w_test_const_vec_res)($[[R1]]) +; MIPS64: ld $[[R2:[0-9]+]], %got_page([[LABEL]])($[[R1:[0-9]+]]) +; MIPS64: daddiu $[[R2]], $[[R2]], %got_ofst([[LABEL]]) +; MIPS64: ld $[[R3:[0-9]+]], %got_disp(llvm_mips_srl_w_test_const_vec_res)($[[R1]]) +; ALL: ld.w $w0, 0($[[R2]]) +; ALL: st.w $w0, 0($[[R3]]) diff --git a/test/CodeGen/Mips/msa/shift_no_and.ll b/test/CodeGen/Mips/msa/shift_no_and.ll new file mode 100644 index 000000000000..c6f90215af9c --- /dev/null +++ b/test/CodeGen/Mips/msa/shift_no_and.ll @@ -0,0 +1,460 @@ +; Test the absence of the andi.b / and.v instructions + +; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s +; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s + +@llvm_mips_bclr_b_ARG1 = global <16 x i8> , align 16 +@llvm_mips_bclr_b_ARG2 = global <16 x i8> , align 16 +@llvm_mips_bclr_b_RES = global <16 x 
i8> , align 16 + +define void @llvm_mips_bclr_b_test() nounwind { +entry: + %0 = load <16 x i8>, <16 x i8>* @llvm_mips_bclr_b_ARG1 + %1 = load <16 x i8>, <16 x i8>* @llvm_mips_bclr_b_ARG2 + %2 = tail call <16 x i8> @llvm.mips.bclr.b(<16 x i8> %0, <16 x i8> %1) + store <16 x i8> %2, <16 x i8>* @llvm_mips_bclr_b_RES + ret void +} + +declare <16 x i8> @llvm.mips.bclr.b(<16 x i8>, <16 x i8>) nounwind + +; CHECK-LABEL: llvm_mips_bclr_b_test: +; CHECK-NOT: andi.b +; CHECK: bclr.b + +@llvm_mips_bclr_h_ARG1 = global <8 x i16> , align 16 +@llvm_mips_bclr_h_ARG2 = global <8 x i16> , align 16 +@llvm_mips_bclr_h_RES = global <8 x i16> , align 16 + +define void @llvm_mips_bclr_h_test() nounwind { +entry: + %0 = load <8 x i16>, <8 x i16>* @llvm_mips_bclr_h_ARG1 + %1 = load <8 x i16>, <8 x i16>* @llvm_mips_bclr_h_ARG2 + %2 = tail call <8 x i16> @llvm.mips.bclr.h(<8 x i16> %0, <8 x i16> %1) + store <8 x i16> %2, <8 x i16>* @llvm_mips_bclr_h_RES + ret void +} + +declare <8 x i16> @llvm.mips.bclr.h(<8 x i16>, <8 x i16>) nounwind + +; CHECK-LABEL: llvm_mips_bclr_h_test: +; CHECK-NOT: and.v +; CHECK: bclr.h + +@llvm_mips_bclr_w_ARG1 = global <4 x i32> , align 16 +@llvm_mips_bclr_w_ARG2 = global <4 x i32> , align 16 +@llvm_mips_bclr_w_RES = global <4 x i32> , align 16 + +define void @llvm_mips_bclr_w_test() nounwind { +entry: + %0 = load <4 x i32>, <4 x i32>* @llvm_mips_bclr_w_ARG1 + %1 = load <4 x i32>, <4 x i32>* @llvm_mips_bclr_w_ARG2 + %2 = tail call <4 x i32> @llvm.mips.bclr.w(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %2, <4 x i32>* @llvm_mips_bclr_w_RES + ret void +} + +declare <4 x i32> @llvm.mips.bclr.w(<4 x i32>, <4 x i32>) nounwind + +; CHECK-LABEL: llvm_mips_bclr_w_test: +; CHECK-NOT: and.v +; CHECK: bclr.w + +@llvm_mips_bclr_d_ARG1 = global <2 x i64> , align 16 +@llvm_mips_bclr_d_ARG2 = global <2 x i64> , align 16 +@llvm_mips_bclr_d_RES = global <2 x i64> , align 16 + +define void @llvm_mips_bclr_d_test() nounwind { +entry: + %0 = load <2 x i64>, <2 x i64>* 
@llvm_mips_bclr_d_ARG1 + %1 = load <2 x i64>, <2 x i64>* @llvm_mips_bclr_d_ARG2 + %2 = tail call <2 x i64> @llvm.mips.bclr.d(<2 x i64> %0, <2 x i64> %1) + store <2 x i64> %2, <2 x i64>* @llvm_mips_bclr_d_RES + ret void +} + +declare <2 x i64> @llvm.mips.bclr.d(<2 x i64>, <2 x i64>) nounwind + +; CHECK-LABEL: llvm_mips_bclr_d_test: +; CHECK-NOT: and.v +; CHECK: bclr.d + +@llvm_mips_bneg_b_ARG1 = global <16 x i8> , align 16 +@llvm_mips_bneg_b_ARG2 = global <16 x i8> , align 16 +@llvm_mips_bneg_b_RES = global <16 x i8> , align 16 + +define void @llvm_mips_bneg_b_test() nounwind { +entry: + %0 = load <16 x i8>, <16 x i8>* @llvm_mips_bneg_b_ARG1 + %1 = load <16 x i8>, <16 x i8>* @llvm_mips_bneg_b_ARG2 + %2 = tail call <16 x i8> @llvm.mips.bneg.b(<16 x i8> %0, <16 x i8> %1) + store <16 x i8> %2, <16 x i8>* @llvm_mips_bneg_b_RES + ret void +} + +declare <16 x i8> @llvm.mips.bneg.b(<16 x i8>, <16 x i8>) nounwind + +; CHECK-LABEL: llvm_mips_bneg_b_test: +; CHECK-NOT: andi.b +; CHECK: bneg.b + +@llvm_mips_bneg_h_ARG1 = global <8 x i16> , align 16 +@llvm_mips_bneg_h_ARG2 = global <8 x i16> , align 16 +@llvm_mips_bneg_h_RES = global <8 x i16> , align 16 + +define void @llvm_mips_bneg_h_test() nounwind { +entry: + %0 = load <8 x i16>, <8 x i16>* @llvm_mips_bneg_h_ARG1 + %1 = load <8 x i16>, <8 x i16>* @llvm_mips_bneg_h_ARG2 + %2 = tail call <8 x i16> @llvm.mips.bneg.h(<8 x i16> %0, <8 x i16> %1) + store <8 x i16> %2, <8 x i16>* @llvm_mips_bneg_h_RES + ret void +} + +declare <8 x i16> @llvm.mips.bneg.h(<8 x i16>, <8 x i16>) nounwind + +; CHECK-LABEL: llvm_mips_bneg_h_test: +; CHECK-NOT: and.v +; CHECK: bneg.h + +@llvm_mips_bneg_w_ARG1 = global <4 x i32> , align 16 +@llvm_mips_bneg_w_ARG2 = global <4 x i32> , align 16 +@llvm_mips_bneg_w_RES = global <4 x i32> , align 16 + +define void @llvm_mips_bneg_w_test() nounwind { +entry: + %0 = load <4 x i32>, <4 x i32>* @llvm_mips_bneg_w_ARG1 + %1 = load <4 x i32>, <4 x i32>* @llvm_mips_bneg_w_ARG2 + %2 = tail call <4 x i32> 
@llvm.mips.bneg.w(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %2, <4 x i32>* @llvm_mips_bneg_w_RES + ret void +} + +declare <4 x i32> @llvm.mips.bneg.w(<4 x i32>, <4 x i32>) nounwind + +; CHECK-LABEL: llvm_mips_bneg_w_test: +; CHECK-NOT: and.v +; CHECK: bneg.w + +@llvm_mips_bneg_d_ARG1 = global <2 x i64> , align 16 +@llvm_mips_bneg_d_ARG2 = global <2 x i64> , align 16 +@llvm_mips_bneg_d_RES = global <2 x i64> , align 16 + +define void @llvm_mips_bneg_d_test() nounwind { +entry: + %0 = load <2 x i64>, <2 x i64>* @llvm_mips_bneg_d_ARG1 + %1 = load <2 x i64>, <2 x i64>* @llvm_mips_bneg_d_ARG2 + %2 = tail call <2 x i64> @llvm.mips.bneg.d(<2 x i64> %0, <2 x i64> %1) + store <2 x i64> %2, <2 x i64>* @llvm_mips_bneg_d_RES + ret void +} + +declare <2 x i64> @llvm.mips.bneg.d(<2 x i64>, <2 x i64>) nounwind + +; CHECK-LABEL: llvm_mips_bneg_d_test: +; CHECK-NOT: and.v +; CHECK: bneg.d + +@llvm_mips_bset_b_ARG1 = global <16 x i8> , align 16 +@llvm_mips_bset_b_ARG2 = global <16 x i8> , align 16 +@llvm_mips_bset_b_RES = global <16 x i8> , align 16 + +define void @llvm_mips_bset_b_test() nounwind { +entry: + %0 = load <16 x i8>, <16 x i8>* @llvm_mips_bset_b_ARG1 + %1 = load <16 x i8>, <16 x i8>* @llvm_mips_bset_b_ARG2 + %2 = tail call <16 x i8> @llvm.mips.bset.b(<16 x i8> %0, <16 x i8> %1) + store <16 x i8> %2, <16 x i8>* @llvm_mips_bset_b_RES + ret void +} + +declare <16 x i8> @llvm.mips.bset.b(<16 x i8>, <16 x i8>) nounwind + +; CHECK-LABEL: llvm_mips_bset_b_test: +; CHECK-NOT: andi.b +; CHECK: bset.b + +@llvm_mips_bset_h_ARG1 = global <8 x i16> , align 16 +@llvm_mips_bset_h_ARG2 = global <8 x i16> , align 16 +@llvm_mips_bset_h_RES = global <8 x i16> , align 16 + +define void @llvm_mips_bset_h_test() nounwind { +entry: + %0 = load <8 x i16>, <8 x i16>* @llvm_mips_bset_h_ARG1 + %1 = load <8 x i16>, <8 x i16>* @llvm_mips_bset_h_ARG2 + %2 = tail call <8 x i16> @llvm.mips.bset.h(<8 x i16> %0, <8 x i16> %1) + store <8 x i16> %2, <8 x i16>* @llvm_mips_bset_h_RES + ret void +} + 
+declare <8 x i16> @llvm.mips.bset.h(<8 x i16>, <8 x i16>) nounwind + +; CHECK-LABEL: llvm_mips_bset_h_test: +; CHECK-NOT: and.v +; CHECK: bset.h + +@llvm_mips_bset_w_ARG1 = global <4 x i32> , align 16 +@llvm_mips_bset_w_ARG2 = global <4 x i32> , align 16 +@llvm_mips_bset_w_RES = global <4 x i32> , align 16 + +define void @llvm_mips_bset_w_test() nounwind { +entry: + %0 = load <4 x i32>, <4 x i32>* @llvm_mips_bset_w_ARG1 + %1 = load <4 x i32>, <4 x i32>* @llvm_mips_bset_w_ARG2 + %2 = tail call <4 x i32> @llvm.mips.bset.w(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %2, <4 x i32>* @llvm_mips_bset_w_RES + ret void +} + +declare <4 x i32> @llvm.mips.bset.w(<4 x i32>, <4 x i32>) nounwind + +; CHECK-LABEL: llvm_mips_bset_w_test: +; CHECK-NOT: and.v +; CHECK: bset.w + +@llvm_mips_bset_d_ARG1 = global <2 x i64> , align 16 +@llvm_mips_bset_d_ARG2 = global <2 x i64> , align 16 +@llvm_mips_bset_d_RES = global <2 x i64> , align 16 + +define void @llvm_mips_bset_d_test() nounwind { +entry: + %0 = load <2 x i64>, <2 x i64>* @llvm_mips_bset_d_ARG1 + %1 = load <2 x i64>, <2 x i64>* @llvm_mips_bset_d_ARG2 + %2 = tail call <2 x i64> @llvm.mips.bset.d(<2 x i64> %0, <2 x i64> %1) + store <2 x i64> %2, <2 x i64>* @llvm_mips_bset_d_RES + ret void +} + +declare <2 x i64> @llvm.mips.bset.d(<2 x i64>, <2 x i64>) nounwind + +; CHECK-LABEL: llvm_mips_bset_d_test: +; CHECK-NOT: and.v +; CHECK: bset.d + +@llvm_mips_sll_b_ARG1 = global <16 x i8> , align 16 +@llvm_mips_sll_b_ARG2 = global <16 x i8> , align 16 +@llvm_mips_sll_b_RES = global <16 x i8> , align 16 + +define void @llvm_mips_sll_b_test() nounwind { +entry: + %0 = load <16 x i8>, <16 x i8>* @llvm_mips_sll_b_ARG1 + %1 = load <16 x i8>, <16 x i8>* @llvm_mips_sll_b_ARG2 + %2 = tail call <16 x i8> @llvm.mips.sll.b(<16 x i8> %0, <16 x i8> %1) + store <16 x i8> %2, <16 x i8>* @llvm_mips_sll_b_RES + ret void +} + +declare <16 x i8> @llvm.mips.sll.b(<16 x i8>, <16 x i8>) nounwind + +; CHECK-LABEL: llvm_mips_sll_b_test: +; CHECK-NOT: andi.b 
+; CHECK: sll.b + +@llvm_mips_sll_h_ARG1 = global <8 x i16> , align 16 +@llvm_mips_sll_h_ARG2 = global <8 x i16> , align 16 +@llvm_mips_sll_h_RES = global <8 x i16> , align 16 + +define void @llvm_mips_sll_h_test() nounwind { +entry: + %0 = load <8 x i16>, <8 x i16>* @llvm_mips_sll_h_ARG1 + %1 = load <8 x i16>, <8 x i16>* @llvm_mips_sll_h_ARG2 + %2 = tail call <8 x i16> @llvm.mips.sll.h(<8 x i16> %0, <8 x i16> %1) + store <8 x i16> %2, <8 x i16>* @llvm_mips_sll_h_RES + ret void +} + +declare <8 x i16> @llvm.mips.sll.h(<8 x i16>, <8 x i16>) nounwind + +; CHECK-LABEL: llvm_mips_sll_h_test: +; CHECK-NOT: and.v +; CHECK: sll.h + +@llvm_mips_sll_w_ARG1 = global <4 x i32> , align 16 +@llvm_mips_sll_w_ARG2 = global <4 x i32> , align 16 +@llvm_mips_sll_w_RES = global <4 x i32> , align 16 + +define void @llvm_mips_sll_w_test() nounwind { +entry: + %0 = load <4 x i32>, <4 x i32>* @llvm_mips_sll_w_ARG1 + %1 = load <4 x i32>, <4 x i32>* @llvm_mips_sll_w_ARG2 + %2 = tail call <4 x i32> @llvm.mips.sll.w(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %2, <4 x i32>* @llvm_mips_sll_w_RES + ret void +} + +declare <4 x i32> @llvm.mips.sll.w(<4 x i32>, <4 x i32>) nounwind + +; CHECK-LABEL: llvm_mips_sll_w_test: +; CHECK-NOT: and.v +; CHECK: sll.w + +@llvm_mips_sll_d_ARG1 = global <2 x i64> , align 16 +@llvm_mips_sll_d_ARG2 = global <2 x i64> , align 16 +@llvm_mips_sll_d_RES = global <2 x i64> , align 16 + +define void @llvm_mips_sll_d_test() nounwind { +entry: + %0 = load <2 x i64>, <2 x i64>* @llvm_mips_sll_d_ARG1 + %1 = load <2 x i64>, <2 x i64>* @llvm_mips_sll_d_ARG2 + %2 = tail call <2 x i64> @llvm.mips.sll.d(<2 x i64> %0, <2 x i64> %1) + store <2 x i64> %2, <2 x i64>* @llvm_mips_sll_d_RES + ret void +} + +declare <2 x i64> @llvm.mips.sll.d(<2 x i64>, <2 x i64>) nounwind + +; CHECK-LABEL: llvm_mips_sll_d_test: +; CHECK-NOT: and.v +; CHECK: sll.d + +@llvm_mips_sra_b_ARG1 = global <16 x i8> , align 16 +@llvm_mips_sra_b_ARG2 = global <16 x i8> , align 16 +@llvm_mips_sra_b_RES = global 
<16 x i8> , align 16 + +define void @llvm_mips_sra_b_test() nounwind { +entry: + %0 = load <16 x i8>, <16 x i8>* @llvm_mips_sra_b_ARG1 + %1 = load <16 x i8>, <16 x i8>* @llvm_mips_sra_b_ARG2 + %2 = tail call <16 x i8> @llvm.mips.sra.b(<16 x i8> %0, <16 x i8> %1) + store <16 x i8> %2, <16 x i8>* @llvm_mips_sra_b_RES + ret void +} + +declare <16 x i8> @llvm.mips.sra.b(<16 x i8>, <16 x i8>) nounwind + +; CHECK-LABEL: llvm_mips_sra_b_test: +; CHECK-NOT: andi.b +; CHECK: sra.b + +@llvm_mips_sra_h_ARG1 = global <8 x i16> , align 16 +@llvm_mips_sra_h_ARG2 = global <8 x i16> , align 16 +@llvm_mips_sra_h_RES = global <8 x i16> , align 16 + +define void @llvm_mips_sra_h_test() nounwind { +entry: + %0 = load <8 x i16>, <8 x i16>* @llvm_mips_sra_h_ARG1 + %1 = load <8 x i16>, <8 x i16>* @llvm_mips_sra_h_ARG2 + %2 = tail call <8 x i16> @llvm.mips.sra.h(<8 x i16> %0, <8 x i16> %1) + store <8 x i16> %2, <8 x i16>* @llvm_mips_sra_h_RES + ret void +} + +declare <8 x i16> @llvm.mips.sra.h(<8 x i16>, <8 x i16>) nounwind + +; CHECK-LABEL: llvm_mips_sra_h_test: +; CHECK-NOT: and.v +; CHECK: sra.h + +@llvm_mips_sra_w_ARG1 = global <4 x i32> , align 16 +@llvm_mips_sra_w_ARG2 = global <4 x i32> , align 16 +@llvm_mips_sra_w_RES = global <4 x i32> , align 16 + +define void @llvm_mips_sra_w_test() nounwind { +entry: + %0 = load <4 x i32>, <4 x i32>* @llvm_mips_sra_w_ARG1 + %1 = load <4 x i32>, <4 x i32>* @llvm_mips_sra_w_ARG2 + %2 = tail call <4 x i32> @llvm.mips.sra.w(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %2, <4 x i32>* @llvm_mips_sra_w_RES + ret void +} + +declare <4 x i32> @llvm.mips.sra.w(<4 x i32>, <4 x i32>) nounwind + +; CHECK-LABEL: llvm_mips_sra_w_test: +; CHECK-NOT: and.v +; CHECK: sra.w + +@llvm_mips_sra_d_ARG1 = global <2 x i64> , align 16 +@llvm_mips_sra_d_ARG2 = global <2 x i64> , align 16 +@llvm_mips_sra_d_RES = global <2 x i64> , align 16 + +define void @llvm_mips_sra_d_test() nounwind { +entry: + %0 = load <2 x i64>, <2 x i64>* @llvm_mips_sra_d_ARG1 + %1 = load <2 x 
i64>, <2 x i64>* @llvm_mips_sra_d_ARG2 + %2 = tail call <2 x i64> @llvm.mips.sra.d(<2 x i64> %0, <2 x i64> %1) + store <2 x i64> %2, <2 x i64>* @llvm_mips_sra_d_RES + ret void +} + +declare <2 x i64> @llvm.mips.sra.d(<2 x i64>, <2 x i64>) nounwind + +; CHECK-LABEL: llvm_mips_sra_d_test: +; CHECK-NOT: and.v +; CHECK: sra.d + +@llvm_mips_srl_b_ARG1 = global <16 x i8> , align 16 +@llvm_mips_srl_b_ARG2 = global <16 x i8> , align 16 +@llvm_mips_srl_b_RES = global <16 x i8> , align 16 + +define void @llvm_mips_srl_b_test() nounwind { +entry: + %0 = load <16 x i8>, <16 x i8>* @llvm_mips_srl_b_ARG1 + %1 = load <16 x i8>, <16 x i8>* @llvm_mips_srl_b_ARG2 + %2 = tail call <16 x i8> @llvm.mips.srl.b(<16 x i8> %0, <16 x i8> %1) + store <16 x i8> %2, <16 x i8>* @llvm_mips_srl_b_RES + ret void +} + +declare <16 x i8> @llvm.mips.srl.b(<16 x i8>, <16 x i8>) nounwind + +; CHECK-LABEL: llvm_mips_srl_b_test: +; CHECK-NOT: andi.b +; CHECK: srl.b + +@llvm_mips_srl_h_ARG1 = global <8 x i16> , align 16 +@llvm_mips_srl_h_ARG2 = global <8 x i16> , align 16 +@llvm_mips_srl_h_RES = global <8 x i16> , align 16 + +define void @llvm_mips_srl_h_test() nounwind { +entry: + %0 = load <8 x i16>, <8 x i16>* @llvm_mips_srl_h_ARG1 + %1 = load <8 x i16>, <8 x i16>* @llvm_mips_srl_h_ARG2 + %2 = tail call <8 x i16> @llvm.mips.srl.h(<8 x i16> %0, <8 x i16> %1) + store <8 x i16> %2, <8 x i16>* @llvm_mips_srl_h_RES + ret void +} + +declare <8 x i16> @llvm.mips.srl.h(<8 x i16>, <8 x i16>) nounwind + +; CHECK-LABEL: llvm_mips_srl_h_test: +; CHECK-NOT: and.v +; CHECK: srl.h + +@llvm_mips_srl_w_ARG1 = global <4 x i32> , align 16 +@llvm_mips_srl_w_ARG2 = global <4 x i32> , align 16 +@llvm_mips_srl_w_RES = global <4 x i32> , align 16 + +define void @llvm_mips_srl_w_test() nounwind { +entry: + %0 = load <4 x i32>, <4 x i32>* @llvm_mips_srl_w_ARG1 + %1 = load <4 x i32>, <4 x i32>* @llvm_mips_srl_w_ARG2 + %2 = tail call <4 x i32> @llvm.mips.srl.w(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %2, <4 x i32>* 
@llvm_mips_srl_w_RES + ret void +} + +declare <4 x i32> @llvm.mips.srl.w(<4 x i32>, <4 x i32>) nounwind + +; CHECK-LABEL: llvm_mips_srl_w_test: +; CHECK-NOT: and.v +; CHECK: srl.w + +@llvm_mips_srl_d_ARG1 = global <2 x i64> , align 16 +@llvm_mips_srl_d_ARG2 = global <2 x i64> , align 16 +@llvm_mips_srl_d_RES = global <2 x i64> , align 16 + +define void @llvm_mips_srl_d_test() nounwind { +entry: + %0 = load <2 x i64>, <2 x i64>* @llvm_mips_srl_d_ARG1 + %1 = load <2 x i64>, <2 x i64>* @llvm_mips_srl_d_ARG2 + %2 = tail call <2 x i64> @llvm.mips.srl.d(<2 x i64> %0, <2 x i64> %1) + store <2 x i64> %2, <2 x i64>* @llvm_mips_srl_d_RES + ret void +} + +declare <2 x i64> @llvm.mips.srl.d(<2 x i64>, <2 x i64>) nounwind + +; CHECK-LABEL: llvm_mips_srl_d_test: +; CHECK-NOT: and.v +; CHECK: srl.d diff --git a/test/CodeGen/PowerPC/andc.ll b/test/CodeGen/PowerPC/andc.ll index 6135db510ad5..df47bfc1e38e 100644 --- a/test/CodeGen/PowerPC/andc.ll +++ b/test/CodeGen/PowerPC/andc.ll @@ -1,12 +1,13 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-apple-darwin | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-unknown | FileCheck %s define i1 @and_cmp1(i32 %x, i32 %y) { ; CHECK-LABEL: and_cmp1: -; CHECK: andc [[REG1:r[0-9]+]], r4, r3 -; CHECK: cntlzw [[REG2:r[0-9]+]], [[REG1]] -; CHECK: rlwinm r3, [[REG2]], 27, 31, 31 -; CHECK: blr - +; CHECK: # BB#0: +; CHECK-NEXT: andc 3, 4, 3 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31 +; CHECK-NEXT: blr %and = and i32 %x, %y %cmp = icmp eq i32 %and, %y ret i1 %cmp @@ -14,12 +15,12 @@ define i1 @and_cmp1(i32 %x, i32 %y) { define i1 @and_cmp_const(i32 %x) { ; CHECK-LABEL: and_cmp_const: -; CHECK: li [[REG1:r[0-9]+]], 43 -; CHECK: andc [[REG2:r[0-9]+]], [[REG1]], r3 -; CHECK: cntlzw [[REG3:r[0-9]+]], [[REG2]] -; CHECK: rlwinm r3, [[REG3]], 27, 31, 31 -; CHECK: blr - +; CHECK: # BB#0: +; CHECK-NEXT: li 4, 43 +; 
CHECK-NEXT: andc 3, 4, 3 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31 +; CHECK-NEXT: blr %and = and i32 %x, 43 %cmp = icmp eq i32 %and, 43 ret i1 %cmp @@ -27,15 +28,26 @@ define i1 @and_cmp_const(i32 %x) { define i1 @foo(i32 %i) { ; CHECK-LABEL: foo: -; CHECK: lis [[REG1:r[0-9]+]], 4660 -; CHECK: ori [[REG2:r[0-9]+]], [[REG1]], 22136 -; CHECK: andc [[REG3:r[0-9]+]], [[REG2]], r3 -; CHECK: cntlzw [[REG4:r[0-9]+]], [[REG3]] -; CHECK: rlwinm r3, [[REG4]], 27, 31, 31 -; CHECK: blr - +; CHECK: # BB#0: +; CHECK-NEXT: lis 4, 4660 +; CHECK-NEXT: ori 4, 4, 22136 +; CHECK-NEXT: andc 3, 4, 3 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31 +; CHECK-NEXT: blr %and = and i32 %i, 305419896 %cmp = icmp eq i32 %and, 305419896 ret i1 %cmp } +define <4 x i32> @hidden_not_v4i32(<4 x i32> %x) { +; CHECK-LABEL: hidden_not_v4i32: +; CHECK: # BB#0: +; CHECK-NEXT: vspltisw 3, 6 +; CHECK-NEXT: xxlandc 34, 35, 34 +; CHECK-NEXT: blr + %xor = xor <4 x i32> %x, + %and = and <4 x i32> %xor, + ret <4 x i32> %and +} + diff --git a/test/CodeGen/WebAssembly/returned.ll b/test/CodeGen/WebAssembly/returned.ll index b059fd8a5987..dfd3fad794f1 100644 --- a/test/CodeGen/WebAssembly/returned.ll +++ b/test/CodeGen/WebAssembly/returned.ll @@ -47,3 +47,34 @@ define void @test_constant_arg() { ret void } declare i32* @returns_arg(i32* returned) + +; Test that the optimization isn't performed on arguments without the +; "returned" attribute. 
+ +; CHECK-LABEL: test_other_skipped: +; CHECK-NEXT: .param i32, i32, f64{{$}} +; CHECK-NEXT: {{^}} i32.call $drop=, do_something@FUNCTION, $0, $1, $2{{$}} +; CHECK-NEXT: {{^}} call do_something_with_i32@FUNCTION, $1{{$}} +; CHECK-NEXT: {{^}} call do_something_with_double@FUNCTION, $2{{$}} +declare i32 @do_something(i32 returned, i32, double) +declare void @do_something_with_i32(i32) +declare void @do_something_with_double(double) +define void @test_other_skipped(i32 %a, i32 %b, double %c) { + %call = call i32 @do_something(i32 %a, i32 %b, double %c) + call void @do_something_with_i32(i32 %b) + call void @do_something_with_double(double %c) + ret void +} + +; Test that the optimization is performed on arguments other than the first. + +; CHECK-LABEL: test_second_arg: +; CHECK-NEXT: .param i32, i32{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: {{^}} i32.call $push0=, do_something_else@FUNCTION, $0, $1{{$}} +; CHECK-NEXT: return $pop0{{$}} +declare i32 @do_something_else(i32, i32 returned) +define i32 @test_second_arg(i32 %a, i32 %b) { + %call = call i32 @do_something_else(i32 %a, i32 %b) + ret i32 %b +} diff --git a/test/CodeGen/X86/GlobalISel/X86-regbankselect.mir b/test/CodeGen/X86/GlobalISel/X86-regbankselect.mir index c4e5fb2d05fc..8e04239041a8 100644 --- a/test/CodeGen/X86/GlobalISel/X86-regbankselect.mir +++ b/test/CodeGen/X86/GlobalISel/X86-regbankselect.mir @@ -106,6 +106,10 @@ ret void } + define void @trunc_check() { + ret void + } + ... --- name: test_add_i8 @@ -632,3 +636,27 @@ body: | RET 0 ... 
+--- +name: trunc_check +alignment: 4 +legalized: true +# CHECK-LABEL: name: trunc_check +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +# CHECK-NEXT: - { id: 3, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0 (%ir-block.0): + %0(s32) = IMPLICIT_DEF + %1(s1) = G_TRUNC %0(s32) + %2(s8) = G_TRUNC %0(s32) + %3(s16) = G_TRUNC %0(s32) + RET 0 + +... diff --git a/test/CodeGen/X86/GlobalISel/binop-isel.ll b/test/CodeGen/X86/GlobalISel/binop-isel.ll deleted file mode 100644 index 8499dd958447..000000000000 --- a/test/CodeGen/X86/GlobalISel/binop-isel.ll +++ /dev/null @@ -1,186 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL - -define i64 @test_add_i64(i64 %arg1, i64 %arg2) { -; ALL-LABEL: test_add_i64: -; ALL: # BB#0: -; ALL-NEXT: leaq (%rsi,%rdi), %rax -; ALL-NEXT: retq - %ret = add i64 %arg1, %arg2 - ret i64 %ret -} - -define i32 @test_add_i32(i32 %arg1, i32 %arg2) { -; ALL-LABEL: test_add_i32: -; ALL: # BB#0: -; ALL-NEXT: # kill: %EDI %EDI %RDI -; ALL-NEXT: # kill: %ESI %ESI %RSI -; ALL-NEXT: leal (%rsi,%rdi), %eax -; ALL-NEXT: retq - %ret = add i32 %arg1, %arg2 - ret i32 %ret -} - -define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { -; ALL-LABEL: test_sub_i64: -; ALL: # 
BB#0: -; ALL-NEXT: subq %rsi, %rdi -; ALL-NEXT: movq %rdi, %rax -; ALL-NEXT: retq - %ret = sub i64 %arg1, %arg2 - ret i64 %ret -} - -define i32 @test_sub_i32(i32 %arg1, i32 %arg2) { -; ALL-LABEL: test_sub_i32: -; ALL: # BB#0: -; ALL-NEXT: subl %esi, %edi -; ALL-NEXT: movl %edi, %eax -; ALL-NEXT: retq - %ret = sub i32 %arg1, %arg2 - ret i32 %ret -} - -define float @test_add_float(float %arg1, float %arg2) { -; SSE-LABEL: test_add_float: -; SSE: # BB#0: -; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_add_float: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = fadd float %arg1, %arg2 - ret float %ret -} - -define double @test_add_double(double %arg1, double %arg2) { -; SSE-LABEL: test_add_double: -; SSE: # BB#0: -; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_add_double: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = fadd double %arg1, %arg2 - ret double %ret -} - -define float @test_sub_float(float %arg1, float %arg2) { -; SSE-LABEL: test_sub_float: -; SSE: # BB#0: -; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_sub_float: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = fsub float %arg1, %arg2 - ret float %ret -} - -define double @test_sub_double(double %arg1, double %arg2) { -; SSE-LABEL: test_sub_double: -; SSE: # BB#0: -; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_sub_double: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = fsub double %arg1, %arg2 - ret double %ret -} - -define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { -; SSE-LABEL: test_add_v4i32: -; SSE: # BB#0: -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_add_v4i32: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = add 
<4 x i32> %arg1, %arg2 - ret <4 x i32> %ret -} - -define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { -; SSE-LABEL: test_sub_v4i32: -; SSE: # BB#0: -; SSE-NEXT: psubd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_sub_v4i32: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = sub <4 x i32> %arg1, %arg2 - ret <4 x i32> %ret -} - -define <4 x float> @test_add_v4f32(<4 x float> %arg1, <4 x float> %arg2) { -; SSE-LABEL: test_add_v4f32: -; SSE: # BB#0: -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_add_v4f32: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = fadd <4 x float> %arg1, %arg2 - ret <4 x float> %ret -} - -define <4 x float> @test_sub_v4f32(<4 x float> %arg1, <4 x float> %arg2) { -; SSE-LABEL: test_sub_v4f32: -; SSE: # BB#0: -; SSE-NEXT: subps %xmm1, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_sub_v4f32: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0 -; ALL_AVX-NEXT: retq - %ret = fsub <4 x float> %arg1, %arg2 - ret <4 x float> %ret -} - -define i32 @test_copy_float(float %val) { -; SSE-LABEL: test_copy_float: -; SSE: # BB#0: -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_copy_float: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovd %xmm0, %eax -; ALL_AVX-NEXT: retq - %r = bitcast float %val to i32 - ret i32 %r -} - -define float @test_copy_i32(i32 %val) { -; SSE-LABEL: test_copy_i32: -; SSE: # BB#0: -; SSE-NEXT: movd %edi, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_copy_i32: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovd %edi, %xmm0 -; ALL_AVX-NEXT: retq - %r = bitcast i32 %val to float - ret float %r -} - diff --git a/test/CodeGen/X86/GlobalISel/binop.ll b/test/CodeGen/X86/GlobalISel/binop.ll new file mode 100644 index 000000000000..8499dd958447 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/binop.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL + +define i64 @test_add_i64(i64 %arg1, i64 %arg2) { +; ALL-LABEL: test_add_i64: +; ALL: # BB#0: +; ALL-NEXT: leaq (%rsi,%rdi), %rax +; ALL-NEXT: retq + %ret = add i64 %arg1, %arg2 + ret i64 %ret +} + +define i32 @test_add_i32(i32 %arg1, i32 %arg2) { +; ALL-LABEL: test_add_i32: +; ALL: # BB#0: +; ALL-NEXT: # kill: %EDI %EDI %RDI +; ALL-NEXT: # kill: %ESI %ESI %RSI +; ALL-NEXT: leal (%rsi,%rdi), %eax +; ALL-NEXT: retq + %ret = add i32 %arg1, %arg2 + ret i32 %ret +} + +define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { +; ALL-LABEL: test_sub_i64: +; ALL: # BB#0: +; ALL-NEXT: subq %rsi, %rdi +; ALL-NEXT: movq %rdi, %rax +; ALL-NEXT: retq + %ret = sub i64 %arg1, %arg2 + ret i64 %ret +} + +define i32 @test_sub_i32(i32 %arg1, i32 %arg2) { +; ALL-LABEL: test_sub_i32: +; ALL: # BB#0: +; ALL-NEXT: subl %esi, %edi +; ALL-NEXT: movl %edi, %eax +; ALL-NEXT: retq + %ret = sub i32 %arg1, %arg2 + ret i32 %ret +} + +define float @test_add_float(float %arg1, float %arg2) { +; SSE-LABEL: test_add_float: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_add_float: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = fadd float %arg1, %arg2 + ret float %ret +} + +define double @test_add_double(double %arg1, double %arg2) { +; SSE-LABEL: test_add_double: +; SSE: # BB#0: +; SSE-NEXT: addsd 
%xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_add_double: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = fadd double %arg1, %arg2 + ret double %ret +} + +define float @test_sub_float(float %arg1, float %arg2) { +; SSE-LABEL: test_sub_float: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_sub_float: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = fsub float %arg1, %arg2 + ret float %ret +} + +define double @test_sub_double(double %arg1, double %arg2) { +; SSE-LABEL: test_sub_double: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_sub_double: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = fsub double %arg1, %arg2 + ret double %ret +} + +define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { +; SSE-LABEL: test_add_v4i32: +; SSE: # BB#0: +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_add_v4i32: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = add <4 x i32> %arg1, %arg2 + ret <4 x i32> %ret +} + +define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { +; SSE-LABEL: test_sub_v4i32: +; SSE: # BB#0: +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_sub_v4i32: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = sub <4 x i32> %arg1, %arg2 + ret <4 x i32> %ret +} + +define <4 x float> @test_add_v4f32(<4 x float> %arg1, <4 x float> %arg2) { +; SSE-LABEL: test_add_v4f32: +; SSE: # BB#0: +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_add_v4f32: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = fadd <4 x float> %arg1, %arg2 + ret <4 x float> %ret +} + +define <4 x float> @test_sub_v4f32(<4 x float> 
%arg1, <4 x float> %arg2) { +; SSE-LABEL: test_sub_v4f32: +; SSE: # BB#0: +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_sub_v4f32: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; ALL_AVX-NEXT: retq + %ret = fsub <4 x float> %arg1, %arg2 + ret <4 x float> %ret +} + +define i32 @test_copy_float(float %val) { +; SSE-LABEL: test_copy_float: +; SSE: # BB#0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_copy_float: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vmovd %xmm0, %eax +; ALL_AVX-NEXT: retq + %r = bitcast float %val to i32 + ret i32 %r +} + +define float @test_copy_i32(i32 %val) { +; SSE-LABEL: test_copy_i32: +; SSE: # BB#0: +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_copy_i32: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vmovd %edi, %xmm0 +; ALL_AVX-NEXT: retq + %r = bitcast i32 %val to float + ret float %r +} + diff --git a/test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir b/test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir deleted file mode 100644 index 2fa9ac23a7af..000000000000 --- a/test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir +++ /dev/null @@ -1,36 +0,0 @@ -# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 -# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ABI - ---- | - define i32* @allocai32() { - %ptr1 = alloca i32 - ret i32* %ptr1 - } - -... 
---- -name: allocai32 -legalized: true -regBankSelected: true -selected: false -# CHECK-LABEL: name: allocai32 -# CHECK: registers: -# CHECK-X32: - { id: 0, class: gr32 } -# CHECK-X32ABI: - { id: 0, class: gr32 } -# CHECK-X64: - { id: 0, class: gr64 } -registers: - - { id: 0, class: gpr } -stack: - - { id: 0, name: ptr1, offset: 0, size: 4, alignment: 4 } - -# CHECK-X32: %0 = LEA32r %stack.0.ptr1, 1, _, 0, _ -# CHECK-X32ABI: %0 = LEA64_32r %stack.0.ptr1, 1, _, 0, _ -# CHECK-X64: %0 = LEA64r %stack.0.ptr1, 1, _, 0, _ -body: | - bb.1 (%ir-block.0): - %0(p0) = G_FRAME_INDEX %stack.0.ptr1 - %eax = COPY %0(p0) - RET 0, implicit %eax - -... diff --git a/test/CodeGen/X86/GlobalISel/legalize-const.mir b/test/CodeGen/X86/GlobalISel/legalize-const.mir deleted file mode 100644 index 612d33a77fc9..000000000000 --- a/test/CodeGen/X86/GlobalISel/legalize-const.mir +++ /dev/null @@ -1,43 +0,0 @@ -# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 - ---- | - define void @constInt_check() { - ret void - } - -... ---- -name: constInt_check -# ALL-LABEL: name: constInt_check -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } - - { id: 4, class: _ } -body: | - bb.1 (%ir-block.0): - ; ALL: %5(s8) = G_CONSTANT i8 -1 - ; ALL: %0(s1) = G_TRUNC %5(s8) - %0(s1) = G_CONSTANT i1 1 - - ; ALL: %1(s8) = G_CONSTANT i8 8 - %1(s8) = G_CONSTANT i8 8 - - ; ALL: %2(s16) = G_CONSTANT i16 16 - %2(s16) = G_CONSTANT i16 16 - - ; ALL: %3(s32) = G_CONSTANT i32 32 - %3(s32) = G_CONSTANT i32 32 - - ; X64: %4(s64) = G_CONSTANT i64 64 - - ; X32: %6(s32) = G_CONSTANT i32 64 - ; X32: %7(s32) = G_CONSTANT i32 0 - ; X32: %4(s64) = G_MERGE_VALUES %6(s32), %7(s32) - %4(s64) = G_CONSTANT i64 64 - - RET 0 -... 
- diff --git a/test/CodeGen/X86/GlobalISel/legalize-constant.mir b/test/CodeGen/X86/GlobalISel/legalize-constant.mir new file mode 100644 index 000000000000..612d33a77fc9 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/legalize-constant.mir @@ -0,0 +1,43 @@ +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +--- | + define void @constInt_check() { + ret void + } + +... +--- +name: constInt_check +# ALL-LABEL: name: constInt_check +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } +body: | + bb.1 (%ir-block.0): + ; ALL: %5(s8) = G_CONSTANT i8 -1 + ; ALL: %0(s1) = G_TRUNC %5(s8) + %0(s1) = G_CONSTANT i1 1 + + ; ALL: %1(s8) = G_CONSTANT i8 8 + %1(s8) = G_CONSTANT i8 8 + + ; ALL: %2(s16) = G_CONSTANT i16 16 + %2(s16) = G_CONSTANT i16 16 + + ; ALL: %3(s32) = G_CONSTANT i32 32 + %3(s32) = G_CONSTANT i32 32 + + ; X64: %4(s64) = G_CONSTANT i64 64 + + ; X32: %6(s32) = G_CONSTANT i32 64 + ; X32: %7(s32) = G_CONSTANT i32 0 + ; X32: %4(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %4(s64) = G_CONSTANT i64 64 + + RET 0 +... + diff --git a/test/CodeGen/X86/GlobalISel/legalize-trunc.mir b/test/CodeGen/X86/GlobalISel/legalize-trunc.mir new file mode 100644 index 000000000000..6b390d990ecf --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/legalize-trunc.mir @@ -0,0 +1,31 @@ +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +--- | + define void @trunc_check() { + ret void + } + +... 
+--- +name: trunc_check +# ALL-LABEL: name: trunc_check +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.1 (%ir-block.0): + %0(s32) = IMPLICIT_DEF + ; ALL: %1(s1) = G_TRUNC %0(s32) + %1(s1) = G_TRUNC %0(s32) + + ; ALL: %2(s8) = G_TRUNC %0(s32) + %2(s8) = G_TRUNC %0(s32) + + ; ALL: %3(s16) = G_TRUNC %0(s32) + %3(s16) = G_TRUNC %0(s32) + RET 0 + +... + diff --git a/test/CodeGen/X86/GlobalISel/memop-isel.ll b/test/CodeGen/X86/GlobalISel/memop-isel.ll deleted file mode 100644 index 6fe66436e4a8..000000000000 --- a/test/CodeGen/X86/GlobalISel/memop-isel.ll +++ /dev/null @@ -1,189 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512F_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512F_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX 
--check-prefix=ALL_AVX_FAST --check-prefix=AVX512VL_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512VL_GREEDY - - -define i8 @test_load_i8(i8 * %p1) { -; ALL-LABEL: test_load_i8: -; ALL: # BB#0: -; ALL-NEXT: movb (%rdi), %al -; ALL-NEXT: retq - %r = load i8, i8* %p1 - ret i8 %r -} - -define i16 @test_load_i16(i16 * %p1) { -; ALL-LABEL: test_load_i16: -; ALL: # BB#0: -; ALL-NEXT: movzwl (%rdi), %eax -; ALL-NEXT: retq - %r = load i16, i16* %p1 - ret i16 %r -} - -define i32 @test_load_i32(i32 * %p1) { -; ALL-LABEL: test_load_i32: -; ALL: # BB#0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: retq - %r = load i32, i32* %p1 - ret i32 %r -} - -define i64 @test_load_i64(i64 * %p1) { -; ALL-LABEL: test_load_i64: -; ALL: # BB#0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: retq - %r = load i64, i64* %p1 - ret i64 %r -} - -define float @test_load_float(float * %p1) { -; SSE-LABEL: test_load_float: -; SSE: # BB#0: -; SSE-NEXT: movl (%rdi), %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_float: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: movl (%rdi), %eax -; ALL_AVX-NEXT: vmovd %eax, %xmm0 -; ALL_AVX-NEXT: retq - %r = load float, float* %p1 - ret float %r -} - -define double @test_load_double(double * %p1) { -; SSE-LABEL: test_load_double: -; SSE: # BB#0: -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_double: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: movq (%rdi), %rax -; ALL_AVX-NEXT: vmovq %rax, %xmm0 -; ALL_AVX-NEXT: retq - %r = load double, double* %p1 - ret double %r -} - -define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_noalign: -; SSE: # BB#0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_noalign: -; ALL_AVX: # BB#0: -; 
ALL_AVX-NEXT: vmovups (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r -} - -define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_align: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_align: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovaps (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r -} - -define i32 * @test_store_i32(i32 %val, i32 * %p1) { -; ALL-LABEL: test_store_i32: -; ALL: # BB#0: -; ALL-NEXT: movl %edi, (%rsi) -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: retq - store i32 %val, i32* %p1 - ret i32 * %p1; -} - -define i64 * @test_store_i64(i64 %val, i64 * %p1) { -; ALL-LABEL: test_store_i64: -; ALL: # BB#0: -; ALL-NEXT: movq %rdi, (%rsi) -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: retq - store i64 %val, i64* %p1 - ret i64 * %p1; -} - -define float * @test_store_float(float %val, float * %p1) { -; -; SSE_FAST-LABEL: test_store_float: -; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movd %xmm0, %eax -; SSE_FAST-NEXT: movl %eax, (%rdi) -; SSE_FAST-NEXT: movq %rdi, %rax -; SSE_FAST-NEXT: retq -; -; SSE_GREEDY-LABEL: test_store_float: -; SSE_GREEDY: # BB#0: -; SSE_GREEDY-NEXT: movss %xmm0, (%rdi) -; SSE_GREEDY-NEXT: movq %rdi, %rax -; SSE_GREEDY-NEXT: retq -; -; ALL_AVX_FAST-LABEL: test_store_float: -; ALL_AVX_FAST: # BB#0: -; ALL_AVX_FAST-NEXT: vmovd %xmm0, %eax -; ALL_AVX_FAST-NEXT: movl %eax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_float: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovss %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq - store float %val, float* %p1 - ret float * %p1; -} - -define double * @test_store_double(double %val, double * %p1) { -; -; SSE_FAST-LABEL: test_store_double: -; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movd %xmm0, %rax -; SSE_FAST-NEXT: movq 
%rax, (%rdi) -; SSE_FAST-NEXT: movq %rdi, %rax -; SSE_FAST-NEXT: retq -; -; SSE_GREEDY-LABEL: test_store_double: -; SSE_GREEDY: # BB#0: -; SSE_GREEDY-NEXT: movsd %xmm0, (%rdi) -; SSE_GREEDY-NEXT: movq %rdi, %rax -; SSE_GREEDY-NEXT: retq -; -; ALL_AVX_FAST-LABEL: test_store_double: -; ALL_AVX_FAST: # BB#0: -; ALL_AVX_FAST-NEXT: vmovq %xmm0, %rax -; ALL_AVX_FAST-NEXT: movq %rax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_double: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovsd %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq - store double %val, double* %p1 - ret double * %p1; -} - diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop.ll new file mode 100644 index 000000000000..6fe66436e4a8 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST +; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX_FAST +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX_GREEDY +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512F_FAST +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -regbankselect-greedy -global-isel < %s -o - | FileCheck %s 
--check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512F_GREEDY +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512VL_FAST +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512VL_GREEDY + + +define i8 @test_load_i8(i8 * %p1) { +; ALL-LABEL: test_load_i8: +; ALL: # BB#0: +; ALL-NEXT: movb (%rdi), %al +; ALL-NEXT: retq + %r = load i8, i8* %p1 + ret i8 %r +} + +define i16 @test_load_i16(i16 * %p1) { +; ALL-LABEL: test_load_i16: +; ALL: # BB#0: +; ALL-NEXT: movzwl (%rdi), %eax +; ALL-NEXT: retq + %r = load i16, i16* %p1 + ret i16 %r +} + +define i32 @test_load_i32(i32 * %p1) { +; ALL-LABEL: test_load_i32: +; ALL: # BB#0: +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: retq + %r = load i32, i32* %p1 + ret i32 %r +} + +define i64 @test_load_i64(i64 * %p1) { +; ALL-LABEL: test_load_i64: +; ALL: # BB#0: +; ALL-NEXT: movq (%rdi), %rax +; ALL-NEXT: retq + %r = load i64, i64* %p1 + ret i64 %r +} + +define float @test_load_float(float * %p1) { +; SSE-LABEL: test_load_float: +; SSE: # BB#0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_float: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: movl (%rdi), %eax +; ALL_AVX-NEXT: vmovd %eax, %xmm0 +; ALL_AVX-NEXT: retq + %r = load float, float* %p1 + ret float %r +} + +define double @test_load_double(double * %p1) { +; SSE-LABEL: test_load_double: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_double: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: movq (%rdi), %rax +; ALL_AVX-NEXT: vmovq %rax, %xmm0 +; ALL_AVX-NEXT: retq + %r = load double, double* %p1 + ret double 
%r +} + +define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { +; SSE-LABEL: test_load_v4i32_noalign: +; SSE: # BB#0: +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_v4i32_noalign: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vmovups (%rdi), %xmm0 +; ALL_AVX-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 1 + ret <4 x i32> %r +} + +define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { +; SSE-LABEL: test_load_v4i32_align: +; SSE: # BB#0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_v4i32_align: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: vmovaps (%rdi), %xmm0 +; ALL_AVX-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r +} + +define i32 * @test_store_i32(i32 %val, i32 * %p1) { +; ALL-LABEL: test_store_i32: +; ALL: # BB#0: +; ALL-NEXT: movl %edi, (%rsi) +; ALL-NEXT: movq %rsi, %rax +; ALL-NEXT: retq + store i32 %val, i32* %p1 + ret i32 * %p1; +} + +define i64 * @test_store_i64(i64 %val, i64 * %p1) { +; ALL-LABEL: test_store_i64: +; ALL: # BB#0: +; ALL-NEXT: movq %rdi, (%rsi) +; ALL-NEXT: movq %rsi, %rax +; ALL-NEXT: retq + store i64 %val, i64* %p1 + ret i64 * %p1; +} + +define float * @test_store_float(float %val, float * %p1) { +; +; SSE_FAST-LABEL: test_store_float: +; SSE_FAST: # BB#0: +; SSE_FAST-NEXT: movd %xmm0, %eax +; SSE_FAST-NEXT: movl %eax, (%rdi) +; SSE_FAST-NEXT: movq %rdi, %rax +; SSE_FAST-NEXT: retq +; +; SSE_GREEDY-LABEL: test_store_float: +; SSE_GREEDY: # BB#0: +; SSE_GREEDY-NEXT: movss %xmm0, (%rdi) +; SSE_GREEDY-NEXT: movq %rdi, %rax +; SSE_GREEDY-NEXT: retq +; +; ALL_AVX_FAST-LABEL: test_store_float: +; ALL_AVX_FAST: # BB#0: +; ALL_AVX_FAST-NEXT: vmovd %xmm0, %eax +; ALL_AVX_FAST-NEXT: movl %eax, (%rdi) +; ALL_AVX_FAST-NEXT: movq %rdi, %rax +; ALL_AVX_FAST-NEXT: retq +; +; ALL_AVX_GREEDY-LABEL: test_store_float: +; ALL_AVX_GREEDY: # BB#0: +; ALL_AVX_GREEDY-NEXT: vmovss %xmm0, (%rdi) +; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax +; 
ALL_AVX_GREEDY-NEXT: retq + store float %val, float* %p1 + ret float * %p1; +} + +define double * @test_store_double(double %val, double * %p1) { +; +; SSE_FAST-LABEL: test_store_double: +; SSE_FAST: # BB#0: +; SSE_FAST-NEXT: movd %xmm0, %rax +; SSE_FAST-NEXT: movq %rax, (%rdi) +; SSE_FAST-NEXT: movq %rdi, %rax +; SSE_FAST-NEXT: retq +; +; SSE_GREEDY-LABEL: test_store_double: +; SSE_GREEDY: # BB#0: +; SSE_GREEDY-NEXT: movsd %xmm0, (%rdi) +; SSE_GREEDY-NEXT: movq %rdi, %rax +; SSE_GREEDY-NEXT: retq +; +; ALL_AVX_FAST-LABEL: test_store_double: +; ALL_AVX_FAST: # BB#0: +; ALL_AVX_FAST-NEXT: vmovq %xmm0, %rax +; ALL_AVX_FAST-NEXT: movq %rax, (%rdi) +; ALL_AVX_FAST-NEXT: movq %rdi, %rax +; ALL_AVX_FAST-NEXT: retq +; +; ALL_AVX_GREEDY-LABEL: test_store_double: +; ALL_AVX_GREEDY: # BB#0: +; ALL_AVX_GREEDY-NEXT: vmovsd %xmm0, (%rdi) +; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax +; ALL_AVX_GREEDY-NEXT: retq + store double %val, double* %p1 + ret double * %p1; +} + diff --git a/test/CodeGen/X86/GlobalISel/select-add.mir b/test/CodeGen/X86/GlobalISel/select-add.mir new file mode 100644 index 000000000000..27fcc223d2bb --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-add.mir @@ -0,0 +1,226 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select 
-verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define i64 @test_add_i64(i64 %arg1, i64 %arg2) { + %ret = add i64 %arg1, %arg2 + ret i64 %ret + } + + define i32 @test_add_i32(i32 %arg1, i32 %arg2) { + %ret = add i32 %arg1, %arg2 + ret i32 %ret + } + + define float @test_add_float(float %arg1, float %arg2) { + %ret = fadd float %arg1, %arg2 + ret float %ret + } + + define double @test_add_double(double %arg1, double %arg2) { + %ret = fadd double %arg1, %arg2 + ret double %ret + } + + define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { + %ret = add <4 x i32> %arg1, %arg2 + ret <4 x i32> %ret + } + + define <4 x float> @test_add_v4f32(<4 x float> %arg1, <4 x float> %arg2) { + %ret = fadd <4 x float> %arg1, %arg2 + ret <4 x float> %ret + } +... + +--- +name: test_add_i64 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr64 } +# ALL-NEXT: - { id: 1, class: gr64 } +# ALL-NEXT: - { id: 2, class: gr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %rdi +# ALL-NEXT: %1 = COPY %rsi +# ALL-NEXT: %2 = ADD64rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s64) = G_ADD %0, %1 + %rax = COPY %2(s64) + +... + +--- +name: test_add_i32 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %edi +# ALL-NEXT: %1 = COPY %esi +# ALL-NEXT: %2 = ADD32rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s32) = G_ADD %0, %1 + %rax = COPY %2(s32) + +... 
+--- +name: test_add_float +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512F-NEXT: - { id: 0, class: fr32 } +# NO_AVX512F-NEXT: - { id: 1, class: fr32 } +# NO_AVX512F-NEXT: - { id: 2, class: fr32 } +# AVX512ALL-NEXT: - { id: 0, class: fr32x } +# AVX512ALL-NEXT: - { id: 1, class: fr32x } +# AVX512ALL-NEXT: - { id: 2, class: fr32x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = ADDSSrr %0, %1 +# AVX-NEXT: %2 = VADDSSrr %0, %1 +# AVX512F-NEXT: %2 = VADDSSZrr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(s32) = COPY %xmm0 + %1(s32) = COPY %xmm1 + %2(s32) = G_FADD %0, %1 + %xmm0 = COPY %2(s32) + RET 0, implicit %xmm0 + +... +--- +name: test_add_double +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512F-NEXT: - { id: 0, class: fr64 } +# NO_AVX512F-NEXT: - { id: 1, class: fr64 } +# NO_AVX512F-NEXT: - { id: 2, class: fr64 } +# AVX512ALL-NEXT: - { id: 0, class: fr64x } +# AVX512ALL-NEXT: - { id: 1, class: fr64x } +# AVX512ALL-NEXT: - { id: 2, class: fr64x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = ADDSDrr %0, %1 +# AVX-NEXT: %2 = VADDSDrr %0, %1 +# AVX512F-NEXT: %2 = VADDSDZrr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(s64) = COPY %xmm0 + %1(s64) = COPY %xmm1 + %2(s64) = G_FADD %0, %1 + %xmm0 = COPY %2(s64) + RET 0, implicit %xmm0 + +... 
+--- +name: test_add_v4i32 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } +# AVX512VL-NEXT: - { id: 0, class: vr128x } +# AVX512VL-NEXT: - { id: 1, class: vr128x } +# AVX512VL-NEXT: - { id: 2, class: vr128x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = PADDDrr %0, %1 +# AVX-NEXT: %2 = VPADDDrr %0, %1 +# AVX512F-NEXT: %2 = VPADDDrr %0, %1 +# AVX512VL-NEXT: %2 = VPADDDZ128rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(<4 x s32>) = COPY %xmm0 + %1(<4 x s32>) = COPY %xmm1 + %2(<4 x s32>) = G_ADD %0, %1 + %xmm0 = COPY %2(<4 x s32>) + RET 0, implicit %xmm0 + +... +--- +name: test_add_v4f32 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } +# AVX512VL-NEXT: - { id: 0, class: vr128x } +# AVX512VL-NEXT: - { id: 1, class: vr128x } +# AVX512VL-NEXT: - { id: 2, class: vr128x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = ADDPSrr %0, %1 +# AVX-NEXT: %2 = VADDPSrr %0, %1 +# AVX512F-NEXT: %2 = VADDPSrr %0, %1 +# AVX512VL-NEXT: %2 = VADDPSZ128rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(<4 x s32>) = COPY %xmm0 + %1(<4 x s32>) = COPY %xmm1 + %2(<4 x s32>) = G_FADD %0, %1 + %xmm0 = COPY %2(<4 x s32>) + RET 0, implicit %xmm0 + +... 
diff --git a/test/CodeGen/X86/GlobalISel/select-frameIndex.mir b/test/CodeGen/X86/GlobalISel/select-frameIndex.mir new file mode 100644 index 000000000000..2fa9ac23a7af --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-frameIndex.mir @@ -0,0 +1,36 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32 +# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ABI + +--- | + define i32* @allocai32() { + %ptr1 = alloca i32 + ret i32* %ptr1 + } + +... +--- +name: allocai32 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: allocai32 +# CHECK: registers: +# CHECK-X32: - { id: 0, class: gr32 } +# CHECK-X32ABI: - { id: 0, class: gr32 } +# CHECK-X64: - { id: 0, class: gr64 } +registers: + - { id: 0, class: gpr } +stack: + - { id: 0, name: ptr1, offset: 0, size: 4, alignment: 4 } + +# CHECK-X32: %0 = LEA32r %stack.0.ptr1, 1, _, 0, _ +# CHECK-X32ABI: %0 = LEA64_32r %stack.0.ptr1, 1, _, 0, _ +# CHECK-X64: %0 = LEA64r %stack.0.ptr1, 1, _, 0, _ +body: | + bb.1 (%ir-block.0): + %0(p0) = G_FRAME_INDEX %stack.0.ptr1 + %eax = COPY %0(p0) + RET 0, implicit %eax + +... 
diff --git a/test/CodeGen/X86/GlobalISel/select-memop.mir b/test/CodeGen/X86/GlobalISel/select-memop.mir new file mode 100644 index 000000000000..943c9aceb4d1 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop.mir @@ -0,0 +1,582 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define i8 @test_load_i8(i8* %p1) { + %r = load i8, i8* %p1 + ret i8 %r + } + + define i16 @test_load_i16(i16* %p1) { + %r = load i16, i16* %p1 + ret i16 %r + } + + define i32 @test_load_i32(i32* %p1) { + %r = load i32, i32* %p1 + ret i32 %r + } + + define i64 @test_load_i64(i64* %p1) { + %r = load i64, i64* %p1 + ret i64 %r + } + + define float @test_load_float(float* %p1) { + %r = load float, float* %p1 + ret float %r + } + + define float @test_load_float_vecreg(float* %p1) { + %r = load float, float* %p1 + ret float %r + } + + + define double @test_load_double(double* %p1) { + %r = load double, double* %p1 + ret double %r + } + + define double @test_load_double_vecreg(double* %p1) { + %r = load double, double* %p1 + ret double %r + } + + define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x 
i32>* %p1, align 1 + ret <4 x i32> %r + } + + define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r + } + + define i32* @test_store_i32(i32 %val, i32* %p1) { + store i32 %val, i32* %p1 + ret i32* %p1 + } + + define i64* @test_store_i64(i64 %val, i64* %p1) { + store i64 %val, i64* %p1 + ret i64* %p1 + } + + define float* @test_store_float(float %val, float* %p1) { + store float %val, float* %p1 + ret float* %p1 + } + + define float* @test_store_float_vec(float %val, float* %p1) { + store float %val, float* %p1 + ret float* %p1 + } + + define double* @test_store_double(double %val, double* %p1) { + store double %val, double* %p1 + ret double* %p1 + } + + define double* @test_store_double_vec(double %val, double* %p1) { + store double %val, double* %p1 + ret double* %p1 + } + + define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 16 + ret <4 x i32>* %p1 + } + + define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 1 + ret <4 x i32>* %p1 + } + +... +--- +# ALL-LABEL: name: test_load_i8 +name: test_load_i8 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr8 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) +# ALL: %al = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) + %al = COPY %1(s8) + RET 0, implicit %al + +... 
+--- +# ALL-LABEL: name: test_load_i16 +name: test_load_i16 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr16 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) +# ALL: %ax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- +# ALL-LABEL: name: test_load_i32 +name: test_load_i32 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr32 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %eax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %eax = COPY %1(s32) + RET 0, implicit %eax + +... +--- +# ALL-LABEL: name: test_load_i64 +name: test_load_i64 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %rax = COPY %1(s64) + RET 0, implicit %rax + +... 
+--- +# ALL-LABEL: name: test_load_float +name: test_load_float +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr32 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %xmm0 = COPY %1(s32) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_float_vecreg +name: test_load_float_vecreg +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: fr32 } +# AVX512ALL: - { id: 1, class: fr32x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# AVX: %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %xmm0 = COPY %1(s32) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_double +name: test_load_double +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %xmm0 = COPY %1(s64) + RET 0, implicit %xmm0 + +... 
+--- +# ALL-LABEL: name: test_load_double_vecreg +name: test_load_double_vecreg +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: fr64 } +# AVX512ALL: - { id: 1, class: fr64x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# AVX: %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %xmm0 = COPY %1(s64) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_v4i32_noalign +name: test_load_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... 
+--- +# ALL-LABEL: name: test_load_v4i32_align +name: test_load_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_store_i32 +name: test_store_i32 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr32 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %edi +# ALL: %1 = COPY %rsi +# ALL: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %edi, %rsi + + %0(s32) = COPY %edi + %1(p0) = COPY %rsi + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_i64 +name: test_store_i64 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = COPY %rsi +# ALL: MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(p0) = COPY %rsi + G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... 
+--- +# ALL-LABEL: name: test_store_float +name: test_store_float +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: fr32x } +# ALL: - { id: 1, class: gr64 } +# ALL: - { id: 2, class: gr32 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# ALL: %2 = COPY %0 +# ALL: MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s32) = COPY %xmm0 + %1(p0) = COPY %rdi + %2(s32) = COPY %0(s32) + G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_float_vec +name: test_store_float_vec +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: fr32 } +# AVX512ALL: - { id: 0, class: fr32x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# AVX: VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s32) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... 
+--- +# ALL-LABEL: name: test_store_double +name: test_store_double +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: fr64x } +# ALL: - { id: 1, class: gr64 } +# ALL: - { id: 2, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# ALL: %2 = COPY %0 +# ALL: MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s64) = COPY %xmm0 + %1(p0) = COPY %rdi + %2(s64) = COPY %0(s64) + G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_double_vec +name: test_store_double_vec +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: fr64 } +# AVX512ALL: - { id: 0, class: fr64x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# AVX: VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s64) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... 
+--- +# ALL-LABEL: name: test_store_v4i32_align +name: test_store_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_v4i32_noalign +name: test_store_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... 
diff --git a/test/CodeGen/X86/GlobalISel/select-sub.mir b/test/CodeGen/X86/GlobalISel/select-sub.mir new file mode 100644 index 000000000000..d4db6eec6d80 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-sub.mir @@ -0,0 +1,225 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { + %ret = sub i64 %arg1, %arg2 + ret i64 %ret + } + + define i32 @test_sub_i32(i32 %arg1, i32 %arg2) { + %ret = sub i32 %arg1, %arg2 + ret i32 %ret + } + + define float @test_sub_float(float %arg1, float %arg2) { + %ret = fsub float %arg1, %arg2 + ret float %ret + } + + define double @test_sub_double(double %arg1, double %arg2) { + %ret = fsub double %arg1, %arg2 + ret double %ret + } + + define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { + %ret = sub <4 x i32> %arg1, %arg2 + ret <4 x i32> %ret + } + + define <4 x float> @test_sub_v4f32(<4 x float> %arg1, <4 x float> %arg2) { + %ret = fsub <4 x float> %arg1, %arg2 + ret <4 x float> %ret + } + +... 
+--- +name: test_sub_i64 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr64 } +# ALL-NEXT: - { id: 1, class: gr64 } +# ALL-NEXT: - { id: 2, class: gr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %rdi +# ALL-NEXT: %1 = COPY %rsi +# ALL-NEXT: %2 = SUB64rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s64) = G_SUB %0, %1 + %rax = COPY %2(s64) + +... + +--- +name: test_sub_i32 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %edi +# ALL-NEXT: %1 = COPY %esi +# ALL-NEXT: %2 = SUB32rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s32) = G_SUB %0, %1 + %rax = COPY %2(s32) + +... +--- +name: test_sub_float +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512F-NEXT: - { id: 0, class: fr32 } +# NO_AVX512F-NEXT: - { id: 1, class: fr32 } +# NO_AVX512F-NEXT: - { id: 2, class: fr32 } +# AVX512ALL-NEXT: - { id: 0, class: fr32x } +# AVX512ALL-NEXT: - { id: 1, class: fr32x } +# AVX512ALL-NEXT: - { id: 2, class: fr32x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = SUBSSrr %0, %1 +# AVX-NEXT: %2 = VSUBSSrr %0, %1 +# AVX512F-NEXT: %2 = VSUBSSZrr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(s32) = COPY %xmm0 + %1(s32) = COPY %xmm1 + %2(s32) = G_FSUB %0, %1 + %xmm0 = COPY %2(s32) + RET 0, implicit %xmm0 + +... 
+--- +name: test_sub_double +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512F-NEXT: - { id: 0, class: fr64 } +# NO_AVX512F-NEXT: - { id: 1, class: fr64 } +# NO_AVX512F-NEXT: - { id: 2, class: fr64 } +# AVX512ALL-NEXT: - { id: 0, class: fr64x } +# AVX512ALL-NEXT: - { id: 1, class: fr64x } +# AVX512ALL-NEXT: - { id: 2, class: fr64x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = SUBSDrr %0, %1 +# AVX-NEXT: %2 = VSUBSDrr %0, %1 +# AVX512F-NEXT: %2 = VSUBSDZrr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(s64) = COPY %xmm0 + %1(s64) = COPY %xmm1 + %2(s64) = G_FSUB %0, %1 + %xmm0 = COPY %2(s64) + RET 0, implicit %xmm0 +... +--- +name: test_sub_v4i32 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } +# AVX512VL-NEXT: - { id: 0, class: vr128x } +# AVX512VL-NEXT: - { id: 1, class: vr128x } +# AVX512VL-NEXT: - { id: 2, class: vr128x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = PSUBDrr %0, %1 +# AVX-NEXT: %2 = VPSUBDrr %0, %1 +# AVX512F-NEXT: %2 = VPSUBDrr %0, %1 +# AVX512VL-NEXT: %2 = VPSUBDZ128rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(<4 x s32>) = COPY %xmm0 + %1(<4 x s32>) = COPY %xmm1 + %2(<4 x s32>) = G_SUB %0, %1 + %xmm0 = COPY %2(<4 x s32>) + RET 0, implicit %xmm0 + +... 
+--- +name: test_sub_v4f32 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +# ALL: registers: +# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } +# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } +# AVX512VL-NEXT: - { id: 0, class: vr128x } +# AVX512VL-NEXT: - { id: 1, class: vr128x } +# AVX512VL-NEXT: - { id: 2, class: vr128x } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = COPY %xmm0 +# ALL-NEXT: %1 = COPY %xmm1 +# SSE-NEXT: %2 = SUBPSrr %0, %1 +# AVX-NEXT: %2 = VSUBPSrr %0, %1 +# AVX512F-NEXT: %2 = VSUBPSrr %0, %1 +# AVX512VL-NEXT: %2 = VSUBPSZ128rr %0, %1 +body: | + bb.1 (%ir-block.0): + liveins: %xmm0, %xmm1 + + %0(<4 x s32>) = COPY %xmm0 + %1(<4 x s32>) = COPY %xmm1 + %2(<4 x s32>) = G_FSUB %0, %1 + %xmm0 = COPY %2(<4 x s32>) + RET 0, implicit %xmm0 + +... diff --git a/test/CodeGen/X86/GlobalISel/select-trunc.mir b/test/CodeGen/X86/GlobalISel/select-trunc.mir new file mode 100644 index 000000000000..714340248ff6 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-trunc.mir @@ -0,0 +1,183 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK +--- | + define i1 @trunc_i32toi1(i32 %a) { + %r = trunc i32 %a to i1 + ret i1 %r + } + + define i8 @trunc_i32toi8(i32 %a) { + %r = trunc i32 %a to i8 + ret i8 %r + } + + define i16 @trunc_i32toi16(i32 %a) { + %r = trunc i32 %a to i16 + ret i16 %r + } + + define i8 @trunc_i64toi8(i64 %a) { + %r = trunc i64 %a to i8 + ret i8 %r + } + + define i16 @trunc_i64toi16(i64 %a) { + %r = trunc i64 %a to i16 + ret i16 %r + } + + define i32 @trunc_i64toi32(i64 %a) { + %r = trunc i64 %a to i32 + ret i32 %r + } + +... 
+--- +name: trunc_i32toi1 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: trunc_i32toi1 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr8 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %1 = COPY %0.sub_8 +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s32) = COPY %edi + %1(s1) = G_TRUNC %0(s32) + %al = COPY %1(s1) + RET 0, implicit %al + +... +--- +name: trunc_i32toi8 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: trunc_i32toi8 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr8 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %1 = COPY %0.sub_8 +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s32) = COPY %edi + %1(s8) = G_TRUNC %0(s32) + %al = COPY %1(s8) + RET 0, implicit %al + +... +--- +name: trunc_i32toi16 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: trunc_i32toi16 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr16 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %1 = COPY %0.sub_16 +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s32) = COPY %edi + %1(s16) = G_TRUNC %0(s32) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- +name: trunc_i64toi8 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: trunc_i64toi8 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr64 } +# CHECK-NEXT: - { id: 1, class: gr8 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %1 = COPY %0.sub_8 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(s64) = COPY %rdi + %1(s8) = G_TRUNC %0(s64) + %al = COPY %1(s8) + RET 0, implicit %al + +... 
+--- +name: trunc_i64toi16 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: trunc_i64toi16 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr64 } +# CHECK-NEXT: - { id: 1, class: gr16 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %1 = COPY %0.sub_16 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(s64) = COPY %rdi + %1(s16) = G_TRUNC %0(s64) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- +name: trunc_i64toi32 +alignment: 4 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: trunc_i64toi32 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr64 } +# CHECK-NEXT: - { id: 1, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# CHECK: body: +# CHECK: %1 = COPY %0.sub_32 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(s64) = COPY %rdi + %1(s32) = G_TRUNC %0(s64) + %eax = COPY %1(s32) + RET 0, implicit %eax + +... diff --git a/test/CodeGen/X86/GlobalISel/trunc.ll b/test/CodeGen/X86/GlobalISel/trunc.ll new file mode 100644 index 000000000000..a56fc3b5a87f --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/trunc.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=CHECK + +define i1 @trunc_i32toi1(i32 %a) { +; CHECK-LABEL: trunc_i32toi1: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %r = trunc i32 %a to i1 + ret i1 %r +} + +define i8 @trunc_i32toi8(i32 %a) { +; CHECK-LABEL: trunc_i32toi8: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %r = trunc i32 %a to i8 + ret i8 %r +} + +define i16 @trunc_i32toi16(i32 %a) { +; CHECK-LABEL: trunc_i32toi16: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %r = trunc i32 %a to i16 + ret i16 %r +} + +define i8 @trunc_i64toi8(i64 %a) { +; CHECK-LABEL: trunc_i64toi8: +; 
CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %r = trunc i64 %a to i8 + ret i8 %r +} + +define i16 @trunc_i64toi16(i64 %a) { +; CHECK-LABEL: trunc_i64toi16: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %r = trunc i64 %a to i16 + ret i16 %r +} + +define i32 @trunc_i64toi32(i64 %a) { +; CHECK-LABEL: trunc_i64toi32: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %r = trunc i64 %a to i32 + ret i32 %r +} + diff --git a/test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir b/test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir deleted file mode 100644 index 17522c3cb45e..000000000000 --- a/test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir +++ /dev/null @@ -1,1022 +0,0 @@ -# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL - ---- | - define i64 @test_add_i64(i64 %arg1, i64 %arg2) { - %ret = add i64 %arg1, %arg2 - ret i64 %ret - } - - define i32 @test_add_i32(i32 %arg1, i32 %arg2) { - %ret = add i32 %arg1, %arg2 - ret i32 %ret - } - - define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { - %ret = sub i64 %arg1, %arg2 - ret i64 %ret - } - - define i32 
@test_sub_i32(i32 %arg1, i32 %arg2) { - %ret = sub i32 %arg1, %arg2 - ret i32 %ret - } - - define float @test_add_float(float %arg1, float %arg2) { - %ret = fadd float %arg1, %arg2 - ret float %ret - } - - define double @test_add_double(double %arg1, double %arg2) { - %ret = fadd double %arg1, %arg2 - ret double %ret - } - - define float @test_sub_float(float %arg1, float %arg2) { - %ret = fsub float %arg1, %arg2 - ret float %ret - } - - define double @test_sub_double(double %arg1, double %arg2) { - %ret = fsub double %arg1, %arg2 - ret double %ret - } - - define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { - %ret = add <4 x i32> %arg1, %arg2 - ret <4 x i32> %ret - } - - define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { - %ret = sub <4 x i32> %arg1, %arg2 - ret <4 x i32> %ret - } - - define <4 x float> @test_add_v4f32(<4 x float> %arg1, <4 x float> %arg2) { - %ret = fadd <4 x float> %arg1, %arg2 - ret <4 x float> %ret - } - - define <4 x float> @test_sub_v4f32(<4 x float> %arg1, <4 x float> %arg2) { - %ret = fsub <4 x float> %arg1, %arg2 - ret <4 x float> %ret - } - - define i8 @test_load_i8(i8* %p1) { - %r = load i8, i8* %p1 - ret i8 %r - } - - define i16 @test_load_i16(i16* %p1) { - %r = load i16, i16* %p1 - ret i16 %r - } - - define i32 @test_load_i32(i32* %p1) { - %r = load i32, i32* %p1 - ret i32 %r - } - - define i64 @test_load_i64(i64* %p1) { - %r = load i64, i64* %p1 - ret i64 %r - } - - define float @test_load_float(float* %p1) { - %r = load float, float* %p1 - ret float %r - } - - define float @test_load_float_vecreg(float* %p1) { - %r = load float, float* %p1 - ret float %r - } - - - define double @test_load_double(double* %p1) { - %r = load double, double* %p1 - ret double %r - } - - define double @test_load_double_vecreg(double* %p1) { - %r = load double, double* %p1 - ret double %r - } - - define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r - 
} - - define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r - } - - define i32* @test_store_i32(i32 %val, i32* %p1) { - store i32 %val, i32* %p1 - ret i32* %p1 - } - - define i64* @test_store_i64(i64 %val, i64* %p1) { - store i64 %val, i64* %p1 - ret i64* %p1 - } - - define float* @test_store_float(float %val, float* %p1) { - store float %val, float* %p1 - ret float* %p1 - } - - define float* @test_store_float_vec(float %val, float* %p1) { - store float %val, float* %p1 - ret float* %p1 - } - - define double* @test_store_double(double %val, double* %p1) { - store double %val, double* %p1 - ret double* %p1 - } - - define double* @test_store_double_vec(double %val, double* %p1) { - store double %val, double* %p1 - ret double* %p1 - } - - define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 16 - ret <4 x i32>* %p1 - } - - define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 1 - ret <4 x i32>* %p1 - } - -... - ---- -name: test_add_i64 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr64 } -# ALL-NEXT: - { id: 1, class: gr64 } -# ALL-NEXT: - { id: 2, class: gr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %rdi -# ALL-NEXT: %1 = COPY %rsi -# ALL-NEXT: %2 = ADD64rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %edi, %esi - - %0(s64) = COPY %rdi - %1(s64) = COPY %rsi - %2(s64) = G_ADD %0, %1 - %rax = COPY %2(s64) - -... 
- ---- -name: test_add_i32 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %edi -# ALL-NEXT: %1 = COPY %esi -# ALL-NEXT: %2 = ADD32rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %edi, %esi - - %0(s32) = COPY %edi - %1(s32) = COPY %esi - %2(s32) = G_ADD %0, %1 - %rax = COPY %2(s32) - -... - ---- -name: test_sub_i64 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr64 } -# ALL-NEXT: - { id: 1, class: gr64 } -# ALL-NEXT: - { id: 2, class: gr64 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %rdi -# ALL-NEXT: %1 = COPY %rsi -# ALL-NEXT: %2 = SUB64rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %edi, %esi - - %0(s64) = COPY %rdi - %1(s64) = COPY %rsi - %2(s64) = G_SUB %0, %1 - %rax = COPY %2(s64) - -... - ---- -name: test_sub_i32 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %edi -# ALL-NEXT: %1 = COPY %esi -# ALL-NEXT: %2 = SUB32rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %edi, %esi - - %0(s32) = COPY %edi - %1(s32) = COPY %esi - %2(s32) = G_SUB %0, %1 - %rax = COPY %2(s32) - -... 
- ---- -name: test_add_float -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512F-NEXT: - { id: 0, class: fr32 } -# NO_AVX512F-NEXT: - { id: 1, class: fr32 } -# NO_AVX512F-NEXT: - { id: 2, class: fr32 } -# AVX512ALL-NEXT: - { id: 0, class: fr32x } -# AVX512ALL-NEXT: - { id: 1, class: fr32x } -# AVX512ALL-NEXT: - { id: 2, class: fr32x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = ADDSSrr %0, %1 -# AVX-NEXT: %2 = VADDSSrr %0, %1 -# AVX512F-NEXT: %2 = VADDSSZrr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(s32) = COPY %xmm0 - %1(s32) = COPY %xmm1 - %2(s32) = G_FADD %0, %1 - %xmm0 = COPY %2(s32) - RET 0, implicit %xmm0 - -... ---- -name: test_add_double -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512F-NEXT: - { id: 0, class: fr64 } -# NO_AVX512F-NEXT: - { id: 1, class: fr64 } -# NO_AVX512F-NEXT: - { id: 2, class: fr64 } -# AVX512ALL-NEXT: - { id: 0, class: fr64x } -# AVX512ALL-NEXT: - { id: 1, class: fr64x } -# AVX512ALL-NEXT: - { id: 2, class: fr64x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = ADDSDrr %0, %1 -# AVX-NEXT: %2 = VADDSDrr %0, %1 -# AVX512F-NEXT: %2 = VADDSDZrr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(s64) = COPY %xmm0 - %1(s64) = COPY %xmm1 - %2(s64) = G_FADD %0, %1 - %xmm0 = COPY %2(s64) - RET 0, implicit %xmm0 - -... 
---- -name: test_sub_float -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512F-NEXT: - { id: 0, class: fr32 } -# NO_AVX512F-NEXT: - { id: 1, class: fr32 } -# NO_AVX512F-NEXT: - { id: 2, class: fr32 } -# AVX512ALL-NEXT: - { id: 0, class: fr32x } -# AVX512ALL-NEXT: - { id: 1, class: fr32x } -# AVX512ALL-NEXT: - { id: 2, class: fr32x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = SUBSSrr %0, %1 -# AVX-NEXT: %2 = VSUBSSrr %0, %1 -# AVX512F-NEXT: %2 = VSUBSSZrr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(s32) = COPY %xmm0 - %1(s32) = COPY %xmm1 - %2(s32) = G_FSUB %0, %1 - %xmm0 = COPY %2(s32) - RET 0, implicit %xmm0 - -... ---- -name: test_sub_double -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512F-NEXT: - { id: 0, class: fr64 } -# NO_AVX512F-NEXT: - { id: 1, class: fr64 } -# NO_AVX512F-NEXT: - { id: 2, class: fr64 } -# AVX512ALL-NEXT: - { id: 0, class: fr64x } -# AVX512ALL-NEXT: - { id: 1, class: fr64x } -# AVX512ALL-NEXT: - { id: 2, class: fr64x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = SUBSDrr %0, %1 -# AVX-NEXT: %2 = VSUBSDrr %0, %1 -# AVX512F-NEXT: %2 = VSUBSDZrr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(s64) = COPY %xmm0 - %1(s64) = COPY %xmm1 - %2(s64) = G_FSUB %0, %1 - %xmm0 = COPY %2(s64) - RET 0, implicit %xmm0 -... 
---- -name: test_add_v4i32 -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } -# AVX512VL-NEXT: - { id: 0, class: vr128x } -# AVX512VL-NEXT: - { id: 1, class: vr128x } -# AVX512VL-NEXT: - { id: 2, class: vr128x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = PADDDrr %0, %1 -# AVX-NEXT: %2 = VPADDDrr %0, %1 -# AVX512F-NEXT: %2 = VPADDDrr %0, %1 -# AVX512VL-NEXT: %2 = VPADDDZ128rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(<4 x s32>) = COPY %xmm0 - %1(<4 x s32>) = COPY %xmm1 - %2(<4 x s32>) = G_ADD %0, %1 - %xmm0 = COPY %2(<4 x s32>) - RET 0, implicit %xmm0 - -... ---- -name: test_sub_v4i32 -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } -# AVX512VL-NEXT: - { id: 0, class: vr128x } -# AVX512VL-NEXT: - { id: 1, class: vr128x } -# AVX512VL-NEXT: - { id: 2, class: vr128x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = PSUBDrr %0, %1 -# AVX-NEXT: %2 = VPSUBDrr %0, %1 -# AVX512F-NEXT: %2 = VPSUBDrr %0, %1 -# AVX512VL-NEXT: %2 = VPSUBDZ128rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(<4 x s32>) = COPY %xmm0 - %1(<4 x s32>) = COPY %xmm1 - %2(<4 x s32>) = G_SUB %0, %1 - %xmm0 = COPY %2(<4 x s32>) - RET 0, implicit %xmm0 - -... 
---- -name: test_add_v4f32 -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } -# AVX512VL-NEXT: - { id: 0, class: vr128x } -# AVX512VL-NEXT: - { id: 1, class: vr128x } -# AVX512VL-NEXT: - { id: 2, class: vr128x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = ADDPSrr %0, %1 -# AVX-NEXT: %2 = VADDPSrr %0, %1 -# AVX512F-NEXT: %2 = VADDPSrr %0, %1 -# AVX512VL-NEXT: %2 = VADDPSZ128rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(<4 x s32>) = COPY %xmm0 - %1(<4 x s32>) = COPY %xmm1 - %2(<4 x s32>) = G_FADD %0, %1 - %xmm0 = COPY %2(<4 x s32>) - RET 0, implicit %xmm0 - -... ---- -name: test_sub_v4f32 -alignment: 4 -legalized: true -regBankSelected: true -selected: false -tracksRegLiveness: true -# ALL: registers: -# NO_AVX512VL-NEXT: - { id: 0, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 1, class: vr128 } -# NO_AVX512VL-NEXT: - { id: 2, class: vr128 } -# AVX512VL-NEXT: - { id: 0, class: vr128x } -# AVX512VL-NEXT: - { id: 1, class: vr128x } -# AVX512VL-NEXT: - { id: 2, class: vr128x } -registers: - - { id: 0, class: vecr } - - { id: 1, class: vecr } - - { id: 2, class: vecr } -# ALL: %0 = COPY %xmm0 -# ALL-NEXT: %1 = COPY %xmm1 -# SSE-NEXT: %2 = SUBPSrr %0, %1 -# AVX-NEXT: %2 = VSUBPSrr %0, %1 -# AVX512F-NEXT: %2 = VSUBPSrr %0, %1 -# AVX512VL-NEXT: %2 = VSUBPSZ128rr %0, %1 -body: | - bb.1 (%ir-block.0): - liveins: %xmm0, %xmm1 - - %0(<4 x s32>) = COPY %xmm0 - %1(<4 x s32>) = COPY %xmm1 - %2(<4 x s32>) = G_FSUB %0, %1 - %xmm0 = COPY %2(<4 x s32>) - RET 0, implicit %xmm0 - -... 
---- -# ALL-LABEL: name: test_load_i8 -name: test_load_i8 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr8 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) -# ALL: %al = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) - %al = COPY %1(s8) - RET 0, implicit %al - -... ---- -# ALL-LABEL: name: test_load_i16 -name: test_load_i16 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr16 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) -# ALL: %ax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) - %ax = COPY %1(s16) - RET 0, implicit %ax - -... ---- -# ALL-LABEL: name: test_load_i32 -name: test_load_i32 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr32 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %eax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %eax = COPY %1(s32) - RET 0, implicit %eax - -... 
---- -# ALL-LABEL: name: test_load_i64 -name: test_load_i64 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %rax = COPY %1(s64) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_load_float -name: test_load_float -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr32 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %xmm0 = COPY %1(s32) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_float_vecreg -name: test_load_float_vecreg -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: fr32 } -# AVX512ALL: - { id: 1, class: fr32x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# AVX: %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %xmm0 = COPY %1(s32) - RET 0, implicit %xmm0 - -... 
---- -# ALL-LABEL: name: test_load_double -name: test_load_double -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %xmm0 = COPY %1(s64) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_double_vecreg -name: test_load_double_vecreg -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: fr64 } -# AVX512ALL: - { id: 1, class: fr64x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# AVX: %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %xmm0 = COPY %1(s64) - RET 0, implicit %xmm0 - -... 
---- -# ALL-LABEL: name: test_load_v4i32_noalign -name: test_load_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_v4i32_align -name: test_load_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... 
---- -# ALL-LABEL: name: test_store_i32 -name: test_store_i32 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr32 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %edi -# ALL: %1 = COPY %rsi -# ALL: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %edi, %rsi - - %0(s32) = COPY %edi - %1(p0) = COPY %rsi - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_i64 -name: test_store_i64 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = COPY %rsi -# ALL: MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %rsi - - %0(s64) = COPY %rdi - %1(p0) = COPY %rsi - G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_float -name: test_store_float -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: fr32x } -# ALL: - { id: 1, class: gr64 } -# ALL: - { id: 2, class: gr32 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# ALL: %2 = COPY %0 -# ALL: MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s32) = COPY %xmm0 - %1(p0) = COPY %rdi - %2(s32) = COPY %0(s32) - G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
---- -# ALL-LABEL: name: test_store_float_vec -name: test_store_float_vec -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: fr32 } -# AVX512ALL: - { id: 0, class: fr32x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# AVX: VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s32) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_double -name: test_store_double -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: fr64x } -# ALL: - { id: 1, class: gr64 } -# ALL: - { id: 2, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# ALL: %2 = COPY %0 -# ALL: MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s64) = COPY %xmm0 - %1(p0) = COPY %rdi - %2(s64) = COPY %0(s64) - G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
---- -# ALL-LABEL: name: test_store_double_vec -name: test_store_double_vec -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: fr64 } -# AVX512ALL: - { id: 0, class: fr64x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# AVX: VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s64) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_v4i32_align -name: test_store_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
---- -# ALL-LABEL: name: test_store_v4i32_noalign -name: test_store_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll index 4303b6254464..f89f6e1de1ab 100644 --- a/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -582,3 +582,22 @@ define void @merge_vec_element_and_scalar_load([6 x i64]* %array) { ; CHECK-NEXT: movq %rcx, 40(%rdi) ; CHECK-NEXT: retq } + + + +; Don't let a non-consecutive store thwart merging of the last two. 
+define void @almost_consecutive_stores(i8* %p) { + store i8 0, i8* %p + %p1 = getelementptr i8, i8* %p, i64 42 + store i8 1, i8* %p1 + %p2 = getelementptr i8, i8* %p, i64 2 + store i8 2, i8* %p2 + %p3 = getelementptr i8, i8* %p, i64 3 + store i8 3, i8* %p3 + ret void +; CHECK-LABEL: almost_consecutive_stores +; CHECK-DAG: movb $0, (%rdi) +; CHECK-DAG: movb $1, 42(%rdi) +; CHECK-DAG: movw $770, 2(%rdi) +; CHECK: retq +} diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll index e9e7d5aea273..89abbabee27c 100644 --- a/test/CodeGen/X86/avx-logic.ll +++ b/test/CodeGen/X86/avx-logic.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 define <4 x double> @andpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp { ; CHECK-LABEL: andpd256: @@ -271,3 +271,35 @@ entry: ret <2 x i64> %x } +define <4 x i32> @and_xor_splat1_v4i32(<4 x i32> %x) nounwind { +; AVX-LABEL: and_xor_splat1_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vandnps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: and_xor_splat1_v4i32: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX512-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %xor = xor <4 x i32> %x, + %and = and <4 x i32> %xor, + ret <4 x i32> %and +} + +define <4 x i64> @and_xor_splat1_v4i64(<4 x i64> %x) nounwind { +; AVX-LABEL: and_xor_splat1_v4i64: +; AVX: # BB#0: +; AVX-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: and_xor_splat1_v4i64: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcastsd {{.*}}(%rip), 
%ymm1 +; AVX512-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %xor = xor <4 x i64> %x, + %and = and <4 x i64> %xor, + ret <4 x i64> %and +} + diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 796ee83b6fa7..b31b00e54e83 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -542,7 +542,7 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind re ; KNL: ## BB#0: ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; KNL-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq @@ -923,7 +923,7 @@ define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind ; KNL: ## BB#0: ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq @@ -1110,7 +1110,7 @@ define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind ; KNL: ## BB#0: ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; KNL-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq @@ -1173,7 +1173,7 @@ define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind ; KNL: ## BB#0: 
; KNL-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 -; KNL-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0 ; KNL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index aec1339d653d..7103efe050a4 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1430,7 +1430,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; KNL-LABEL: store_v2i1: ; KNL: ## BB#0: -; KNL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1447,7 +1448,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; ; AVX512BW-LABEL: store_v2i1: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax @@ -1457,7 +1459,8 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; ; AVX512DQ-LABEL: store_v2i1: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rdi) @@ -1471,7 +1474,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; KNL-LABEL: store_v4i1: ; KNL: ## BB#0: -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; KNL-NEXT: vpcmpeqd %xmm1, 
%xmm1, %xmm1 ; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpslld $31, %ymm0, %ymm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1489,7 +1492,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; ; AVX512BW-LABEL: store_v4i1: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1500,7 +1503,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; ; AVX512DQ-LABEL: store_v4i1: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512DQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 diff --git a/test/CodeGen/X86/bswap_tree.ll b/test/CodeGen/X86/bswap_tree.ll new file mode 100644 index 000000000000..35a28af85579 --- /dev/null +++ b/test/CodeGen/X86/bswap_tree.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK64 + +; Check reconstructing bswap from shifted masks and tree of ORs + +; Match a 32-bit packed halfword bswap. 
That is +; ((x & 0x000000ff) << 8) | +; ((x & 0x0000ff00) >> 8) | +; ((x & 0x00ff0000) << 8) | +; ((x & 0xff000000) >> 8) +; => (rotl (bswap x), 16) +define i32 @test1(i32 %x) nounwind { +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: andl $-16777216, %eax # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: shrl $8, %eax +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: test1: +; CHECK64: # BB#0: +; CHECK64-NEXT: # kill: %EDI %EDI %RDI +; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: andl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: leal (%rcx,%rdi), %eax +; CHECK64-NEXT: retq + %byte0 = and i32 %x, 255 ; 0x000000ff + %byte1 = and i32 %x, 65280 ; 0x0000ff00 + %byte2 = and i32 %x, 16711680 ; 0x00ff0000 + %byte3 = and i32 %x, 4278190080 ; 0xff000000 + %tmp0 = shl i32 %byte0, 8 + %tmp1 = lshr i32 %byte1, 8 + %tmp2 = shl i32 %byte2, 8 + %tmp3 = lshr i32 %byte3, 8 + %or0 = or i32 %tmp0, %tmp1 + %or1 = or i32 %tmp2, %tmp3 + %result = or i32 %or0, %or1 + ret i32 %result +} + +; the same as test1, just shifts before the "and" +; ((x << 8) & 0x0000ff00) | +; ((x >> 8) & 0x000000ff) | +; ((x << 8) & 0xff000000) | +; ((x >> 8) & 0x00ff0000) +define i32 @test2(i32 %x) nounwind { +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: shrl $8, %eax +; CHECK-NEXT: movzwl %cx, %edx +; CHECK-NEXT: movzbl %al, %esi +; 
CHECK-NEXT: andl $-16777216, %ecx # imm = 0xFF000000 +; CHECK-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl +; +; CHECK64-LABEL: test2: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: shrl $8, %edi +; CHECK64-NEXT: movzwl %cx, %edx +; CHECK64-NEXT: movzbl %dil, %eax +; CHECK64-NEXT: andl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: andl $16711680, %edi # imm = 0xFF0000 +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: orl %ecx, %edi +; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: retq + %byte1 = shl i32 %x, 8 + %byte0 = lshr i32 %x, 8 + %byte3 = shl i32 %x, 8 + %byte2 = lshr i32 %x, 8 + %tmp1 = and i32 %byte1, 65280 ; 0x0000ff00 + %tmp0 = and i32 %byte0, 255 ; 0x000000ff + %tmp3 = and i32 %byte3, 4278190080 ; 0xff000000 + %tmp2 = and i32 %byte2, 16711680 ; 0x00ff0000 + %or0 = or i32 %tmp0, %tmp1 + %or1 = or i32 %tmp2, %tmp3 + %result = or i32 %or0, %or1 + ret i32 %result +} diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll new file mode 100644 index 000000000000..a9c74df9d0d9 --- /dev/null +++ b/test/CodeGen/X86/bswap_tree2.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK64 + +; Check a few invalid patterns for halfword bswap pattern matching + +; Don't match a near-miss 32-bit packed halfword bswap +; (with only half of the swap tree valid). 
+ define i32 @test1(i32 %x) nounwind { +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: shrl $8, %eax +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: test1: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: retq + %byte0 = and i32 %x, 255 ; 0x000000ff + %byte1 = and i32 %x, 65280 ; 0x0000ff00 + %byte2 = and i32 %x, 16711680 ; 0x00ff0000 + %byte3 = or i32 %x, 4278190080 ; 0xff000000 + %tmp0 = shl i32 %byte0, 8 + %tmp1 = lshr i32 %byte1, 8 + %tmp2 = shl i32 %byte2, 8 + %tmp3 = lshr i32 %byte3, 8 + %or0 = or i32 %tmp0, %tmp1 + %or1 = or i32 %tmp2, %tmp3 + %result = or i32 %or0, %or1 + ret i32 %result +} + +; Don't match a near-miss 32-bit packed halfword bswap +; (with swapped lshr/shl) +; ((x >> 8) & 0x0000ff00) | +; ((x << 8) & 0x000000ff) | +; ((x << 8) & 0xff000000) | +; ((x >> 8) & 0x00ff0000) +define i32 @test2(i32 %x) nounwind { +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: shrl $8, %eax +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $65280, %edx # imm = 0xFF00 +; CHECK-NEXT: andl $-16777216, %ecx # imm = 0xFF000000 +; CHECK-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: retl +; 
+; CHECK64-LABEL: test2: +; CHECK64: # BB#0: +; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: shll $8, %edi +; CHECK64-NEXT: movl %eax, %ecx +; CHECK64-NEXT: andl $65280, %ecx # imm = 0xFF00 +; CHECK64-NEXT: andl $-16777216, %edi # imm = 0xFF000000 +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: leal (%rax,%rcx), %eax +; CHECK64-NEXT: retq + %byte1 = lshr i32 %x, 8 + %byte0 = shl i32 %x, 8 + %byte3 = shl i32 %x, 8 + %byte2 = lshr i32 %x, 8 + %tmp1 = and i32 %byte1, 65280 ; 0x0000ff00 + %tmp0 = and i32 %byte0, 255 ; 0x000000ff + %tmp3 = and i32 %byte3, 4278190080 ; 0xff000000 + %tmp2 = and i32 %byte2, 16711680 ; 0x00ff0000 + %or0 = or i32 %tmp0, %tmp1 + %or1 = or i32 %tmp2, %tmp3 + %result = or i32 %or0, %or1 + ret i32 %result +} + +; Invalid pattern involving a unary op +define i32 @test3(float %x) nounwind { +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movw $3199, {{[0-9]+}}(%esp) # imm = 0xC7F +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: shrl $8, %eax +; CHECK-NEXT: andl $65280, %ecx # imm = 0xFF00 +; CHECK-NEXT: andl $-16777216, %edx # imm = 0xFF000000 +; CHECK-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: retl +; +; CHECK64-LABEL: test3: +; CHECK64: # BB#0: +; CHECK64-NEXT: cvttss2si %xmm0, %ecx +; CHECK64-NEXT: movl %ecx, %edx +; CHECK64-NEXT: shll $8, %edx +; CHECK64-NEXT: movl %ecx, %eax +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: andl $65280, %ecx # imm 
= 0xFF00 +; CHECK64-NEXT: andl $-16777216, %edx # imm = 0xFF000000 +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: retq + %integer = fptosi float %x to i32 + %byte0 = shl i32 %integer, 8 + %byte3 = shl i32 %integer, 8 + %byte2 = lshr i32 %integer, 8 + %tmp1 = and i32 %integer, 65280 ; 0x0000ff00 + %tmp0 = and i32 %byte0, 255 ; 0x000000ff + %tmp3 = and i32 %byte3, 4278190080 ; 0xff000000 + %tmp2 = and i32 %byte2, 16711680 ; 0x00ff0000 + %or0 = or i32 %tmp0, %tmp1 + %or1 = or i32 %tmp2, %tmp3 + %result = or i32 %or0, %or1 + ret i32 %result +} diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index e4cf296432ba..d7f52d247988 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -430,6 +430,7 @@ define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) { ret <4 x i32> %or } +; TODO: Why would we do this? ; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) define <2 x i64> @or_and_v2i64(<2 x i64> %a0) { @@ -438,16 +439,17 @@ define <2 x i64> @or_and_v2i64(<2 x i64> %a0) { ; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: orps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq - %1 = and <2 x i64> %a0, + %1 = and <2 x i64> %a0, %2 = or <2 x i64> %1, ret <2 x i64> %2 } +; If all masked bits are going to be set, that's a constant fold. 
+ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) { ; CHECK-LABEL: or_and_v4i32: ; CHECK: # BB#0: -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: orps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3] ; CHECK-NEXT: retq %1 = and <4 x i32> %a0, %2 = or <4 x i32> %1, @@ -459,9 +461,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) { define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) { ; CHECK-LABEL: or_zext_v2i32: ; CHECK: # BB#0: -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-NEXT: por {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295] ; CHECK-NEXT: retq %1 = zext <2 x i32> %a0 to <2 x i64> %2 = or <2 x i64> %1, @@ -471,9 +471,7 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) { define <4 x i32> @or_zext_v4i16(<4 x i16> %a0) { ; CHECK-LABEL: or_zext_v4i16: ; CHECK: # BB#0: -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; CHECK-NEXT: por {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535] ; CHECK-NEXT: retq %1 = zext <4 x i16> %a0 to <4 x i32> %2 = or <4 x i32> %1, diff --git a/test/CodeGen/X86/dbg-baseptr.ll b/test/CodeGen/X86/dbg-baseptr.ll new file mode 100644 index 000000000000..f69c78af7367 --- /dev/null +++ b/test/CodeGen/X86/dbg-baseptr.ll @@ -0,0 +1,75 @@ +; RUN: llc -o - %s | FileCheck %s +; This test checks that parameters on the stack pointer are correctly +; referenced by debug info. 
+target triple = "x86_64--" + +@glob = external global i64 +@ptr = external global i32* +%struct.s = type { i32, i32, i32, i32, i32 } + +; CHECK-LABEL: f0: +; CHECK: DEBUG_VALUE: f:input <- [%RSP+8] +define i32 @f0(%struct.s* byval align 8 %input) !dbg !8 { + call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !4, metadata !17), !dbg !18 + ret i32 42 +} + +; CHECK-LABEL: f1: +; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] +define i32 @f1(%struct.s* byval align 8 %input) !dbg !8 { + %val = load i64, i64* @glob + ; this alloca should force FP usage. + %stackspace = alloca i32, i64 %val, align 1 + store i32* %stackspace, i32** @ptr + call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !4, metadata !17), !dbg !18 + ret i32 42 +} + +; CHECK-LABEL: f2: +; Just check that we are indeed aligning the stack and setting up a base pointer +; in RBX. +; CHECK: pushq %rbp +; CHECK: movq %rsp, %rbp +; CHECK: pushq %rbx +; CHECK: andq $-64, %rsp +; CHECK: subq $64, %rsp +; CHECK: movq %rsp, %rbx +; The parameter should still be referenced through RBP though. 
+; CHECK-NOT: DEBUG_VALUE: f:input <- [%RBX +; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] +define i32 @f2(%struct.s* byval align 8 %input) !dbg !8 { + %val = load i64, i64* @glob + %stackspace = alloca i32, i64 %val, align 64 + store i32* %stackspace, i32** @ptr + call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !4, metadata !17), !dbg !18 + ret i32 42 +} + +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 2, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3) +!3 = !DIFile(filename: "dbg-baseptr.ll", directory: "/") +!4 = !DILocalVariable(name: "input", arg: 1, scope: !8, file: !3, line: 5, type: !9) +!5 = !{} + +!6 = !DISubroutineType(types: !7) +!7 = !{!10, !9} + +!8 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) + +!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", elements: !11) +!10 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!11 = !{!12, !13, !14, !15, !16} +!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", baseType: !10, size: 32) +!13 = !DIDerivedType(tag: DW_TAG_member, name: "b", baseType: !10, size: 32, offset: 32) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "c", baseType: !10, size: 32, offset: 64) +!15 = !DIDerivedType(tag: DW_TAG_member, name: "d", baseType: !10, size: 32, offset: 96) +!16 = !DIDerivedType(tag: DW_TAG_member, name: "e", baseType: !10, size: 32, offset: 128) + +!17 = !DIExpression() +!18 = !DILocation(line: 5, scope: !8) diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll index 1751f03731d3..5286a1b635d1 100644 --- a/test/CodeGen/X86/extract-store.ll +++ b/test/CodeGen/X86/extract-store.ll @@ -5,6 +5,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck 
%s --check-prefix=X64 --check-prefix=SSE-X64 --check-prefix=SSE41-X64 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=AVX-X32 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=AVX-X64 +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking \ +; RUN: | FileCheck %s --check-prefix=X64 --check-prefix=SSE-X64 --check-prefix=SSE-F128 +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx -enable-legalize-types-checking \ +; RUN: | FileCheck %s --check-prefix=X64 --check-prefix=SSE-X64 --check-prefix=SSE-F128 define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind { ; SSE2-X32-LABEL: extract_i8_0: @@ -458,6 +462,26 @@ define void @extract_f64_1(double* nocapture %dst, <2 x double> %foo) nounwind { ret void } +define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind { +; SSE-F128-LABEL: extract_f128_0: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: movaps %xmm0, (%rdi) +; SSE-F128-NEXT: retq + %vecext = extractelement <2 x fp128> %foo, i32 0 + store fp128 %vecext, fp128* %dst, align 1 + ret void +} + +define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind { +; SSE-F128-LABEL: extract_f128_1: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: movaps %xmm1, (%rdi) +; SSE-F128-NEXT: retq + %vecext = extractelement <2 x fp128> %foo, i32 1 + store fp128 %vecext, fp128* %dst, align 1 + ret void +} + define void @extract_i8_undef(i8* nocapture %dst, <16 x i8> %foo) nounwind { ; X32-LABEL: extract_i8_undef: ; X32: # BB#0: @@ -535,3 +559,16 @@ define void @extract_f64_undef(double* nocapture %dst, <2 x double> %foo) nounwi store double %vecext, double* %dst, align 1 ret void } + +define void @extract_f128_undef(fp128* nocapture %dst, <2 x fp128> %foo) nounwind { +; X32-LABEL: extract_f128_undef: +; X32: # BB#0: +; X32-NEXT: retl +; +; X64-LABEL: extract_f128_undef: +; X64: # BB#0: +; X64-NEXT: retq + 
%vecext = extractelement <2 x fp128> %foo, i32 2 ; undef + store fp128 %vecext, fp128* %dst, align 1 + ret void +} diff --git a/test/CodeGen/X86/fp128-extract.ll b/test/CodeGen/X86/fp128-extract.ll new file mode 100644 index 000000000000..5006ac898c71 --- /dev/null +++ b/test/CodeGen/X86/fp128-extract.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \ +; RUN: -enable-legalize-types-checking | FileCheck %s +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \ +; RUN: -enable-legalize-types-checking | FileCheck %s + +; Test the softened result of extractelement op code. +define fp128 @TestExtract(<2 x double> %x) { +entry: + ; Simplified instruction pattern from the output of llvm before r289042, + ; for a boost function ...::insert<...>::traverse<...>(). + %a = fpext <2 x double> %x to <2 x fp128> + %0 = extractelement <2 x fp128> %a, i32 0 + %1 = extractelement <2 x fp128> %a, i32 1 + %2 = fmul fp128 %0, %1 + ret fp128 %2 +; CHECK-LABEL: TestExtract: +; CHECK: movaps %xmm0, (%rsp) +; CHECK: callq __extenddftf2 +; CHECK: callq __extenddftf2 +; CHECK: callq __multf3 +; CHECK: retq +} diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll index da92bdb55d7c..3da1a360e290 100644 --- a/test/CodeGen/X86/i64-to-float.ll +++ b/test/CodeGen/X86/i64-to-float.ll @@ -224,36 +224,32 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pxor %xmm1, %xmm2 -; X64-SSE-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551361,18446744073709551361] -; X64-SSE-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE-NEXT: pxor %xmm3, %xmm4 -; X64-SSE-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm4 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; X64-SSE-NEXT: pand %xmm6, %xmm2 -; X64-SSE-NEXT: 
pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; X64-SSE-NEXT: por %xmm2, %xmm4 -; X64-SSE-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067713,18446744071562067713] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE-NEXT: pand %xmm5, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE-NEXT: por %xmm2, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm2 ; X64-SSE-NEXT: pandn %xmm0, %xmm2 -; X64-SSE-NEXT: pand %xmm3, %xmm4 -; X64-SSE-NEXT: por %xmm2, %xmm4 -; X64-SSE-NEXT: movdqa %xmm4, %xmm0 -; X64-SSE-NEXT: pxor %xmm1, %xmm0 -; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; X64-SSE-NEXT: pxor %xmm2, %xmm1 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: pcmpgtd %xmm1, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm3 +; X64-SSE-NEXT: por %xmm2, %xmm3 +; X64-SSE-NEXT: pxor %xmm3, %xmm1 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pand %xmm5, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; X64-SSE-NEXT: pand %xmm4, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X64-SSE-NEXT: por %xmm0, %xmm1 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE-NEXT: pandn %xmm4, %xmm0 -; X64-SSE-NEXT: pand %xmm2, %xmm1 +; X64-SSE-NEXT: pandn %xmm3, %xmm0 +; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; X64-SSE-NEXT: por %xmm0, %xmm1 ; X64-SSE-NEXT: movd %xmm1, %rax ; X64-SSE-NEXT: xorps %xmm0, %xmm0 diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll index cea9ac26edbc..4c3c8bbd793e 100644 --- 
a/test/CodeGen/X86/known-signbits-vector.ll +++ b/test/CodeGen/X86/known-signbits-vector.ll @@ -100,21 +100,27 @@ define float @signbits_ashr_extract_sitofp(<2 x i64> %a0) nounwind { define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwind { ; X32-LABEL: signbits_ashr_insert_ashr_extract_sitofp: ; X32: # BB#0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl 12(%ebp), %ecx ; X32-NEXT: shrdl $30, %ecx, %eax ; X32-NEXT: sarl $30, %ecx ; X32-NEXT: vmovd %eax, %xmm0 ; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X32-NEXT: vpinsrd $2, 16(%ebp), %xmm0, %xmm0 +; X32-NEXT: vpinsrd $3, 20(%ebp), %xmm0, %xmm0 +; X32-NEXT: vpsrad $3, %xmm0, %xmm1 ; X32-NEXT: vpsrlq $3, %xmm0, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0 -; X32-NEXT: vmovss %xmm0, (%esp) -; X32-NEXT: flds (%esp) -; X32-NEXT: popl %eax +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: fildll {{[0-9]+}}(%esp) +; X32-NEXT: fstps {{[0-9]+}}(%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp ; X32-NEXT: retl ; ; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp: @@ -127,7 +133,7 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin ; X64-NEXT: vpsrlq $3, %xmm0, %xmm0 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X64-NEXT: vmovq %xmm0, %rax -; X64-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; X64-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; X64-NEXT: retq %1 = ashr i64 %a0, 30 %2 = insertelement <2 x i64> undef, i64 %1, i32 0 diff --git a/test/CodeGen/X86/madd.ll 
b/test/CodeGen/X86/madd.ll index fdc5ace8d9bc..d332b2f3169f 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -1,27 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 -;SSE2-label: @_Z10test_shortPsS_i -;SSE2: movdqu -;SSE2-NEXT: movdqu -;SSE2-NEXT: pmaddwd -;SSE2-NEXT: paddd - -;AVX2-label: @_Z10test_shortPsS_i -;AVX2: vmovdqu -;AVX2-NEXT: vpmaddwd -;AVX2-NEXT: vinserti128 -;AVX2-NEXT: vpaddd - -;AVX512-label: @_Z10test_shortPsS_i -;AVX512: vmovdqu -;AVX512-NEXT: vpmaddwd -;AVX512-NEXT: vinserti128 -;AVX512-NEXT: vpaddd - define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { +; SSE2-LABEL: _Z10test_shortPsS_i: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB0_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movdqu (%rdi), %xmm2 +; SSE2-NEXT: movdqu (%rsi), %xmm3 +; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: addq $16, %rsi +; SSE2-NEXT: addq $16, %rdi +; SSE2-NEXT: addq $-8, %rax +; SSE2-NEXT: jne .LBB0_1 +; SSE2-NEXT: # BB#2: # %middle.block +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: _Z10test_shortPsS_i: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: movl %edx, 
%eax +; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB0_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vmovdqu (%rsi), %xmm2 +; AVX2-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: addq $16, %rsi +; AVX2-NEXT: addq $16, %rdi +; AVX2-NEXT: addq $-8, %rax +; AVX2-NEXT: jne .LBB0_1 +; AVX2-NEXT: # BB#2: # %middle.block +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: _Z10test_shortPsS_i: +; AVX512: # BB#0: # %entry +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB0_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vmovdqu (%rsi), %xmm2 +; AVX512-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: addq $16, %rsi +; AVX512-NEXT: addq $16, %rdi +; AVX512-NEXT: addq $-8, %rax +; AVX512-NEXT: jne .LBB0_1 +; AVX512-NEXT: # BB#2: # %middle.block +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %3 = zext i32 %2 to i64 br label %vector.body @@ -54,20 +113,227 @@ middle.block: ret i32 %13 } -;AVX2-label: @_Z9test_charPcS_i -;AVX2: vpmovsxbw -;AVX2-NEXT: vpmovsxbw -;AVX2-NEXT: vpmaddwd -;AVX2-NEXT: 
vpaddd +define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 { +; SSE2-LABEL: test_unsigned_short: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB1_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movdqu (%rdi), %xmm2 +; SSE2-NEXT: movdqu (%rsi), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pmulhuw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: addq $16, %rsi +; SSE2-NEXT: addq $16, %rdi +; SSE2-NEXT: addq $-8, %rax +; SSE2-NEXT: jne .LBB1_1 +; SSE2-NEXT: # BB#2: # %middle.block +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: test_unsigned_short: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB1_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: addq $16, %rsi +; AVX2-NEXT: addq $16, %rdi +; AVX2-NEXT: addq $-8, %rax +; AVX2-NEXT: jne .LBB1_1 
+; AVX2-NEXT: # BB#2: # %middle.block +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_unsigned_short: +; AVX512: # BB#0: # %entry +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB1_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $16, %rsi +; AVX512-NEXT: addq $16, %rdi +; AVX512-NEXT: addq $-8, %rax +; AVX512-NEXT: jne .LBB1_1 +; AVX512-NEXT: # BB#2: # %middle.block +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %3 = zext i32 %2 to i64 + br label %vector.body -;AVX512-label: @_Z9test_charPcS_i -;AVX512: vpmovsxbw -;AVX512-NEXT: vpmovsxbw -;AVX512-NEXT: vpmaddwd -;AVX512-NEXT: vinserti64x4 -;AVX512-NEXT: vpaddd +vector.body: + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ] + %4 = getelementptr inbounds i16, i16* %0, i64 %index + %5 = bitcast i16* %4 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %5, align 2 + %6 = zext <8 x i16> %wide.load to <8 x i32> + 
%7 = getelementptr inbounds i16, i16* %1, i64 %index + %8 = bitcast i16* %7 to <8 x i16>* + %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2 + %9 = zext <8 x i16> %wide.load14 to <8 x i32> + %10 = mul nsw <8 x i32> %9, %6 + %11 = add nsw <8 x i32> %10, %vec.phi + %index.next = add i64 %index, 8 + %12 = icmp eq i64 %index.next, %3 + br i1 %12, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> + %bin.rdx = add <8 x i32> %11, %rdx.shuf + %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15 + %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> + %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17 + %13 = extractelement <8 x i32> %bin.rdx18, i32 0 + ret i32 %13 +} define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 { +; SSE2-LABEL: _Z9test_charPcS_i: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB2_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
SSE2-NEXT: psraw $8, %xmm6 +; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: pmullw %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: addq $16, %rsi +; SSE2-NEXT: addq $16, %rdi +; SSE2-NEXT: addq $-16, %rax +; SSE2-NEXT: jne .LBB2_1 +; SSE2-NEXT: # BB#2: # %middle.block +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; AVX2-LABEL: _Z9test_charPcS_i: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB2_1: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vpmovsxbw (%rdi), %ymm2 +; AVX2-NEXT: vpmovsxbw (%rsi), %ymm3 +; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: addq $16, %rsi +; AVX2-NEXT: addq $16, %rdi +; AVX2-NEXT: addq $-16, %rax +; AVX2-NEXT: jne .LBB2_1 +; AVX2-NEXT: # BB#2: # %middle.block +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; 
AVX512-LABEL: _Z9test_charPcS_i: +; AVX512: # BB#0: # %entry +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB2_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vpmovsxbw (%rdi), %ymm2 +; AVX512-NEXT: vpmovsxbw (%rsi), %ymm3 +; AVX512-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 +; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: addq $16, %rsi +; AVX512-NEXT: addq $16, %rdi +; AVX512-NEXT: addq $-16, %rax +; AVX512-NEXT: jne .LBB2_1 +; AVX512-NEXT: # BB#2: # %middle.block +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %3 = zext i32 %2 to i64 br label %vector.body diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll index 31c1f6582426..dcb7bd010e56 100644 --- a/test/CodeGen/X86/merge_store.ll +++ b/test/CodeGen/X86/merge_store.ll @@ -28,3 +28,34 @@ entry: for.end: ret void } + + + +;; CHECK-LABEL: indexed-store-merge + +;; We should be able to merge the 4 consecutive stores. 
+;; FIXMECHECK: movl $0, 2(%rsi,%rdi) + +;; CHECK: movb $0, 2(%rsi,%rdi) +;; CHECK: movb $0, 3(%rsi,%rdi) +;; CHECK: movb $0, 4(%rsi,%rdi) +;; CHECK: movb $0, 5(%rsi,%rdi) +;; CHECK: movb $0, (%rsi) +define void @indexed-store-merge(i64 %p, i8* %v) { +entry: + %p2 = add nsw i64 %p, 2 + %v2 = getelementptr i8, i8* %v, i64 %p2 + store i8 0, i8* %v2, align 2 + %p3 = add nsw i64 %p, 3 + %v3 = getelementptr i8, i8* %v, i64 %p3 + store i8 0, i8* %v3, align 1 + %p4 = add nsw i64 %p, 4 + %v4 = getelementptr i8, i8* %v, i64 %p4 + store i8 0, i8* %v4, align 2 + %p5 = add nsw i64 %p, 5 + %v5 = getelementptr i8, i8* %v, i64 %p5 + store i8 0, i8* %v5, align 1 + %v0 = getelementptr i8, i8* %v, i64 0 + store i8 0, i8* %v0, align 2 + ret void +} diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll new file mode 100644 index 000000000000..52e6b61aedfe --- /dev/null +++ b/test/CodeGen/X86/sse-schedule.ll @@ -0,0 +1,2415 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 + +define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_addps: +; GENERIC: # BB#0: +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: addps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addps: +; ATOM: # BB#0: +; ATOM-NEXT: addps %xmm1, %xmm0 +; ATOM-NEXT: addps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addps: +; SLM: # BB#0: +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd <4 x float> %a0, %a1 + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = fadd <4 x float> %1, %2 + ret <4 x float> %3 +} + +define float @test_addss(float %a0, float %a1, float *%a2) { +; GENERIC-LABEL: test_addss: +; GENERIC: # BB#0: +; GENERIC-NEXT: addss %xmm1, %xmm0 +; GENERIC-NEXT: addss (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addss: +; ATOM: # BB#0: +; ATOM-NEXT: addss %xmm1, %xmm0 +; ATOM-NEXT: addss (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addss: +; SLM: # BB#0: +; SLM-NEXT: addss %xmm1, 
%xmm0 # sched: [3:1.00] +; SLM-NEXT: addss (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addss: +; SANDY: # BB#0: +; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd float %a0, %a1 + %2 = load float, float *%a2, align 4 + %3 = fadd float %1, %2 + ret float %3 +} + +define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_andps: +; GENERIC: # BB#0: +; GENERIC-NEXT: andps %xmm1, %xmm0 +; GENERIC-NEXT: andps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_andps: +; ATOM: # BB#0: +; ATOM-NEXT: andps %xmm1, %xmm0 +; ATOM-NEXT: andps (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_andps: +; SLM: # BB#0: +; SLM-NEXT: andps %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: andps (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_andps: +; SANDY: # BB#0: +; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: 
test_andps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x float> %a0 to <4 x i32> + %2 = bitcast <4 x float> %a1 to <4 x i32> + %3 = and <4 x i32> %1, %2 + %4 = load <4 x float>, <4 x float> *%a2, align 16 + %5 = bitcast <4 x float> %4 to <4 x i32> + %6 = and <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <4 x float> + ret <4 x float> %7 +} + +define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_andnotps: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnps %xmm1, %xmm0 +; GENERIC-NEXT: andnps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_andnotps: +; ATOM: # BB#0: +; ATOM-NEXT: andnps %xmm1, %xmm0 +; ATOM-NEXT: andnps (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_andnotps: +; SLM: # BB#0: +; SLM-NEXT: andnps %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: andnps (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_andnotps: +; SANDY: # BB#0: +; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andnotps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andnotps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x float> %a0 to <4 x i32> + %2 = bitcast <4 x float> %a1 to <4 x i32> + %3 = xor <4 x i32> %1, + %4 = and <4 x i32> 
%3, %2 + %5 = load <4 x float>, <4 x float> *%a2, align 16 + %6 = bitcast <4 x float> %5 to <4 x i32> + %7 = xor <4 x i32> %4, + %8 = and <4 x i32> %6, %7 + %9 = bitcast <4 x i32> %8 to <4 x float> + ret <4 x float> %9 +} + +define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_cmpps: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpeqps %xmm0, %xmm1 +; GENERIC-NEXT: cmpeqps (%rdi), %xmm0 +; GENERIC-NEXT: orps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cmpps: +; ATOM: # BB#0: +; ATOM-NEXT: cmpeqps %xmm0, %xmm1 +; ATOM-NEXT: cmpeqps (%rdi), %xmm0 +; ATOM-NEXT: orps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cmpps: +; SLM: # BB#0: +; SLM-NEXT: cmpeqps %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: cmpeqps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: orps %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cmpps: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fcmp oeq <4 x float> %a0, %a1 + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = fcmp oeq <4 x float> %a0, %2 + %4 = or <4 x i1> %1, %3 + %5 = sext <4 x i1> %4 to <4 x i32> + %6 = bitcast <4 x i32> %5 to <4 x float> + ret <4 x float> %6 +} + +define float 
@test_cmpss(float %a0, float %a1, float *%a2) { +; GENERIC-LABEL: test_cmpss: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpeqss %xmm1, %xmm0 +; GENERIC-NEXT: cmpeqss (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cmpss: +; ATOM: # BB#0: +; ATOM-NEXT: cmpeqss %xmm1, %xmm0 +; ATOM-NEXT: cmpeqss (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cmpss: +; SLM: # BB#0: +; SLM-NEXT: cmpeqss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: cmpeqss (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cmpss: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmpss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmpss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x float> undef, float %a0, i32 0 + %2 = insertelement <4 x float> undef, float %a1, i32 0 + %3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0) + %4 = load float, float *%a2, align 4 + %5 = insertelement <4 x float> undef, float %4, i32 0 + %6 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %3, <4 x float> %5, i8 0) + %7 = extractelement <4 x float> %6, i32 0 + ret float %7 +} +declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone + +define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_comiss: +; GENERIC: # BB#0: +; GENERIC-NEXT: comiss %xmm1, %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %cl +; GENERIC-NEXT: andb %al, %cl +; GENERIC-NEXT: comiss (%rdi), %xmm0 +; 
GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %dl +; GENERIC-NEXT: andb %al, %dl +; GENERIC-NEXT: orb %cl, %dl +; GENERIC-NEXT: movzbl %dl, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_comiss: +; ATOM: # BB#0: +; ATOM-NEXT: comiss %xmm1, %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %cl +; ATOM-NEXT: andb %al, %cl +; ATOM-NEXT: comiss (%rdi), %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %dl +; ATOM-NEXT: andb %al, %dl +; ATOM-NEXT: orb %cl, %dl +; ATOM-NEXT: movzbl %dl, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_comiss: +; SLM: # BB#0: +; SLM-NEXT: comiss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %cl # sched: [1:0.50] +; SLM-NEXT: andb %al, %cl # sched: [1:0.50] +; SLM-NEXT: comiss (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %dl # sched: [1:0.50] +; SLM-NEXT: andb %al, %dl # sched: [1:0.50] +; SLM-NEXT: orb %cl, %dl # sched: [1:0.50] +; SLM-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_comiss: +; SANDY: # BB#0: +; SANDY-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] +; SANDY-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] +; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] +; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_comiss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %cl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] +; HASWELL-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %dl # sched: [1:0.50] +; 
HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] +; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] +; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_comiss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %cl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] +; BTVER2-NEXT: vcomiss (%rdi), %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %dl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %dl # sched: [1:0.50] +; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] +; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 4 + %3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2) + %4 = or i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone + +define float @test_cvtsi2ss(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_cvtsi2ss: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsi2ssl %edi, %xmm1 +; GENERIC-NEXT: cvtsi2ssl (%rsi), %xmm0 +; GENERIC-NEXT: addss %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsi2ss: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsi2ssl (%rsi), %xmm0 +; ATOM-NEXT: cvtsi2ssl %edi, %xmm1 +; ATOM-NEXT: addss %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsi2ss: +; SLM: # BB#0: +; SLM-NEXT: cvtsi2ssl (%rsi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: cvtsi2ssl %edi, %xmm1 # sched: [4:0.50] +; SLM-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsi2ss: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # 
sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsi2ss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsi2ss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp i32 %a0 to float + %2 = load i32, i32 *%a1, align 4 + %3 = sitofp i32 %2 to float + %4 = fadd float %1, %3 + ret float %4 +} + +define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_cvtsi2ssq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsi2ssq %rdi, %xmm1 +; GENERIC-NEXT: cvtsi2ssq (%rsi), %xmm0 +; GENERIC-NEXT: addss %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsi2ssq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsi2ssq (%rsi), %xmm0 +; ATOM-NEXT: cvtsi2ssq %rdi, %xmm1 +; ATOM-NEXT: addss %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsi2ssq: +; SLM: # BB#0: +; SLM-NEXT: cvtsi2ssq (%rsi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: cvtsi2ssq %rdi, %xmm1 # sched: [4:0.50] +; SLM-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsi2ssq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsi2ssq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsi2ssq: 
+; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp i64 %a0 to float + %2 = load i64, i64 *%a1, align 8 + %3 = sitofp i64 %2 to float + %4 = fadd float %1, %3 + ret float %4 +} + +define i32 @test_cvtss2si(float %a0, float *%a1) { +; GENERIC-LABEL: test_cvtss2si: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtss2si %xmm0, %ecx +; GENERIC-NEXT: cvtss2si (%rdi), %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtss2si: +; ATOM: # BB#0: +; ATOM-NEXT: cvtss2si (%rdi), %eax +; ATOM-NEXT: cvtss2si %xmm0, %ecx +; ATOM-NEXT: addl %ecx, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtss2si: +; SLM: # BB#0: +; SLM-NEXT: cvtss2si (%rdi), %eax # sched: [7:1.00] +; SLM-NEXT: cvtss2si %xmm0, %ecx # sched: [4:0.50] +; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtss2si: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] +; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtss2si: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtss2si %xmm0, %ecx # sched: [4:1.00] +; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtss2si: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtss2si (%rdi), %eax # sched: [8:1.00] +; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x float> undef, float %a0, i32 0 + %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1) + %3 = load float, float *%a1, align 4 + %4 = insertelement <4 x float> undef, float 
%3, i32 0 + %5 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %4) + %6 = add i32 %2, %5 + ret i32 %6 +} +declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone + +define i64 @test_cvtss2siq(float %a0, float *%a1) { +; GENERIC-LABEL: test_cvtss2siq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtss2si %xmm0, %rcx +; GENERIC-NEXT: cvtss2si (%rdi), %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtss2siq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtss2si (%rdi), %rax +; ATOM-NEXT: cvtss2si %xmm0, %rcx +; ATOM-NEXT: addq %rcx, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtss2siq: +; SLM: # BB#0: +; SLM-NEXT: cvtss2si (%rdi), %rax # sched: [7:1.00] +; SLM-NEXT: cvtss2si %xmm0, %rcx # sched: [4:0.50] +; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtss2siq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] +; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtss2siq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtss2si %xmm0, %rcx # sched: [4:1.00] +; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtss2siq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtss2si (%rdi), %rax # sched: [8:1.00] +; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x float> undef, float %a0, i32 0 + %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1) + %3 = load float, float *%a1, align 4 + %4 = insertelement <4 x float> undef, float %3, i32 0 + %5 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %4) + %6 = add i64 %2, %5 + ret i64 %6 +} +declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone + +define i32 @test_cvttss2si(float 
%a0, float *%a1) { +; GENERIC-LABEL: test_cvttss2si: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttss2si %xmm0, %ecx +; GENERIC-NEXT: cvttss2si (%rdi), %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttss2si: +; ATOM: # BB#0: +; ATOM-NEXT: cvttss2si (%rdi), %eax +; ATOM-NEXT: cvttss2si %xmm0, %ecx +; ATOM-NEXT: addl %ecx, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttss2si: +; SLM: # BB#0: +; SLM-NEXT: cvttss2si (%rdi), %eax # sched: [7:1.00] +; SLM-NEXT: cvttss2si %xmm0, %ecx # sched: [4:0.50] +; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttss2si: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00] +; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttss2si: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttss2si %xmm0, %ecx # sched: [4:1.00] +; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttss2si: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttss2si (%rdi), %eax # sched: [8:1.00] +; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi float %a0 to i32 + %2 = load float, float *%a1, align 4 + %3 = fptosi float %2 to i32 + %4 = add i32 %1, %3 + ret i32 %4 +} + +define i64 @test_cvttss2siq(float %a0, float *%a1) { +; GENERIC-LABEL: test_cvttss2siq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttss2si %xmm0, %rcx +; GENERIC-NEXT: cvttss2si (%rdi), %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttss2siq: +; ATOM: # BB#0: +; ATOM-NEXT: cvttss2si (%rdi), %rax +; ATOM-NEXT: cvttss2si %xmm0, %rcx +; ATOM-NEXT: addq %rcx, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttss2siq: +; SLM: # 
BB#0: +; SLM-NEXT: cvttss2si (%rdi), %rax # sched: [7:1.00] +; SLM-NEXT: cvttss2si %xmm0, %rcx # sched: [4:0.50] +; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttss2siq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] +; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttss2siq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttss2si %xmm0, %rcx # sched: [4:1.00] +; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttss2siq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttss2si (%rdi), %rax # sched: [8:1.00] +; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi float %a0 to i64 + %2 = load float, float *%a1, align 4 + %3 = fptosi float %2 to i64 + %4 = add i64 %1, %3 + ret i64 %4 +} + +define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_divps: +; GENERIC: # BB#0: +; GENERIC-NEXT: divps %xmm1, %xmm0 +; GENERIC-NEXT: divps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_divps: +; ATOM: # BB#0: +; ATOM-NEXT: divps %xmm1, %xmm0 +; ATOM-NEXT: divps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_divps: +; SLM: # BB#0: +; SLM-NEXT: divps %xmm1, %xmm0 # sched: [34:34.00] +; SLM-NEXT: divps (%rdi), %xmm0 # sched: [37:34.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_divps: +; SANDY: # BB#0: +; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; 
HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fdiv <4 x float> %a0, %a1 + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = fdiv <4 x float> %1, %2 + ret <4 x float> %3 +} + +define float @test_divss(float %a0, float %a1, float *%a2) { +; GENERIC-LABEL: test_divss: +; GENERIC: # BB#0: +; GENERIC-NEXT: divss %xmm1, %xmm0 +; GENERIC-NEXT: divss (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_divss: +; ATOM: # BB#0: +; ATOM-NEXT: divss %xmm1, %xmm0 +; ATOM-NEXT: divss (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_divss: +; SLM: # BB#0: +; SLM-NEXT: divss %xmm1, %xmm0 # sched: [34:34.00] +; SLM-NEXT: divss (%rdi), %xmm0 # sched: [37:34.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_divss: +; SANDY: # BB#0: +; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fdiv float %a0, %a1 + %2 = load float, float *%a2, align 4 + %3 = fdiv float %1, %2 + ret float %3 +} + +define void @test_ldmxcsr(i32 %a0) { +; GENERIC-LABEL: test_ldmxcsr: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; GENERIC-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_ldmxcsr: +; ATOM: # BB#0: +; 
ATOM-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; ATOM-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_ldmxcsr: +; SLM: # BB#0: +; SLM-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; SLM-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ldmxcsr: +; SANDY: # BB#0: +; SANDY-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_ldmxcsr: +; HASWELL: # BB#0: +; HASWELL-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [6:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ldmxcsr: +; BTVER2: # BB#0: +; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + store i32 %a0, i32* %1 + call void @llvm.x86.sse.ldmxcsr(i8* %2) + ret void +} +declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone + +define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_maxps: +; GENERIC: # BB#0: +; GENERIC-NEXT: maxps %xmm1, %xmm0 +; GENERIC-NEXT: maxps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maxps: +; ATOM: # BB#0: +; ATOM-NEXT: maxps %xmm1, %xmm0 +; ATOM-NEXT: maxps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maxps: +; SLM: # BB#0: +; SLM-NEXT: maxps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: maxps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maxps: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # 
sched: [3:1.00] +; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_maxss: +; GENERIC: # BB#0: +; GENERIC-NEXT: maxss %xmm1, %xmm0 +; GENERIC-NEXT: maxss (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maxss: +; ATOM: # BB#0: +; ATOM-NEXT: maxss %xmm1, %xmm0 +; ATOM-NEXT: maxss (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maxss: +; SLM: # BB#0: +; SLM-NEXT: maxss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: maxss (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maxss: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> 
@llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_minps: +; GENERIC: # BB#0: +; GENERIC-NEXT: minps %xmm1, %xmm0 +; GENERIC-NEXT: minps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_minps: +; ATOM: # BB#0: +; ATOM-NEXT: minps %xmm1, %xmm0 +; ATOM-NEXT: minps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_minps: +; SLM: # BB#0: +; SLM-NEXT: minps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: minps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_minps: +; SANDY: # BB#0: +; SANDY-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_minss: +; GENERIC: # BB#0: +; GENERIC-NEXT: minss %xmm1, %xmm0 +; GENERIC-NEXT: minss (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_minss: +; ATOM: # BB#0: +; ATOM-NEXT: minss %xmm1, %xmm0 +; ATOM-NEXT: 
minss (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_minss: +; SLM: # BB#0: +; SLM-NEXT: minss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: minss (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_minss: +; SANDY: # BB#0: +; SANDY-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone + +define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_movaps: +; GENERIC: # BB#0: +; GENERIC-NEXT: movaps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm0, %xmm0 +; GENERIC-NEXT: movaps %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movaps: +; ATOM: # BB#0: +; ATOM-NEXT: movaps (%rdi), %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm0 +; ATOM-NEXT: movaps %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movaps: +; SLM: # BB#0: +; SLM-NEXT: movaps (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: addps %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movaps %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movaps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] 
+; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movaps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movaps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <4 x float>, <4 x float> *%a0, align 16 + %2 = fadd <4 x float> %1, %1 + store <4 x float> %2, <4 x float> *%a1, align 16 + ret void +} + +; TODO (v)movhlps + +define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) { +; GENERIC-LABEL: test_movhlps: +; GENERIC: # BB#0: +; GENERIC-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movhlps: +; ATOM: # BB#0: +; ATOM-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movhlps: +; SLM: # BB#0: +; SLM-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movhlps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movhlps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movhlps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + ret <4 x float> %1 +} + +; TODO (v)movhps + +define 
void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { +; GENERIC-LABEL: test_movhps: +; GENERIC: # BB#0: +; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; GENERIC-NEXT: addps %xmm0, %xmm1 +; GENERIC-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; GENERIC-NEXT: movlps %xmm1, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movhps: +; ATOM: # BB#0: +; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; ATOM-NEXT: movlps %xmm1, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movhps: +; SLM: # BB#0: +; SLM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] +; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: pextrq $1, %xmm1, (%rdi) # sched: [4:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movhps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movhps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movhps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast x86_mmx* %a2 to <2 x float>* + %2 = load <2 x float>, <2 x float> *%1, align 8 + %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> + %4 = shufflevector <4 x float> %a1, <4 x float> %3, <4 x i32> + %5 = fadd <4 x float> %a0, %4 + %6 = shufflevector <4 x float> %5, <4 x float> undef, <2 x i32> + store <2 x 
float> %6, <2 x float>* %1 + ret void +} + +; TODO (v)movlhps + +define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) { +; GENERIC-LABEL: test_movlhps: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movlhps: +; ATOM: # BB#0: +; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ATOM-NEXT: addps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movlhps: +; SLM: # BB#0: +; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movlhps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movlhps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movlhps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + %2 = fadd <4 x float> %a1, %1 + ret <4 x float> %2 +} + +define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { +; GENERIC-LABEL: test_movlps: +; GENERIC: # BB#0: +; GENERIC-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; GENERIC-NEXT: addps %xmm0, %xmm1 +; GENERIC-NEXT: movlps %xmm1, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movlps: +; ATOM: # BB#0: +; ATOM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movlps %xmm1, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movlps: +; SLM: # BB#0: +; SLM-NEXT: movlpd 
{{.*#+}} xmm1 = mem[0],xmm1[1] sched: [4:1.00] +; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movlps %xmm1, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movlps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movlps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movlps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast x86_mmx* %a2 to <2 x float>* + %2 = load <2 x float>, <2 x float> *%1, align 8 + %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> + %4 = shufflevector <4 x float> %a1, <4 x float> %3, <4 x i32> + %5 = fadd <4 x float> %a0, %4 + %6 = shufflevector <4 x float> %5, <4 x float> undef, <2 x i32> + store <2 x float> %6, <2 x float>* %1 + ret void +} + +define i32 @test_movmskps(<4 x float> %a0) { +; GENERIC-LABEL: test_movmskps: +; GENERIC: # BB#0: +; GENERIC-NEXT: movmskps %xmm0, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movmskps: +; ATOM: # BB#0: +; ATOM-NEXT: movmskps %xmm0, %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movmskps: +; SLM: # BB#0: +; SLM-NEXT: movmskps %xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movmskps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; 
HASWELL-LABEL: test_movmskps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movmskps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone + +define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_movntps: +; GENERIC: # BB#0: +; GENERIC-NEXT: movntps %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movntps: +; ATOM: # BB#0: +; ATOM-NEXT: movntps %xmm0, (%rdi) +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movntps: +; SLM: # BB#0: +; SLM-NEXT: movntps %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movntps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + store <4 x float> %a0, <4 x float> *%a1, align 16, !nontemporal !0 + ret void +} + +define void @test_movss_mem(float* %a0, float* %a1) { +; GENERIC-LABEL: test_movss_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; GENERIC-NEXT: addss %xmm0, %xmm0 +; GENERIC-NEXT: movss %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movss_mem: +; ATOM: # BB#0: +; ATOM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ATOM-NEXT: addss %xmm0, %xmm0 +; ATOM-NEXT: movss %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movss_mem: +; SLM: # 
BB#0: +; SLM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [3:1.00] +; SLM-NEXT: addss %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movss %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movss_mem: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movss_mem: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movss_mem: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load float, float* %a0, align 1 + %2 = fadd float %1, %1 + store float %2, float *%a1, align 1 + ret void +} + +define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) { +; GENERIC-LABEL: test_movss_reg: +; GENERIC: # BB#0: +; GENERIC-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movss_reg: +; ATOM: # BB#0: +; ATOM-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movss_reg: +; SLM: # BB#0: +; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movss_reg: +; SANDY: # BB#0: +; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movss_reg: 
+; HASWELL: # BB#0: +; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movss_reg: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + ret <4 x float> %1 +} + +define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_movups: +; GENERIC: # BB#0: +; GENERIC-NEXT: movups (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm0, %xmm0 +; GENERIC-NEXT: movups %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movups: +; ATOM: # BB#0: +; ATOM-NEXT: movups (%rdi), %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm0 +; ATOM-NEXT: movups %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movups: +; SLM: # BB#0: +; SLM-NEXT: movups (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: addps %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movups %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movups: +; SANDY: # BB#0: +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movups: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movups: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovups (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <4 x float>, <4 x float> *%a0, align 1 + %2 = fadd <4 x float> %1, %1 + store <4 x float> %2, <4 x float> *%a1, align 1 + ret void +} + 
+define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_mulps: +; GENERIC: # BB#0: +; GENERIC-NEXT: mulps %xmm1, %xmm0 +; GENERIC-NEXT: mulps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mulps: +; ATOM: # BB#0: +; ATOM-NEXT: mulps %xmm1, %xmm0 +; ATOM-NEXT: mulps (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mulps: +; SLM: # BB#0: +; SLM-NEXT: mulps %xmm1, %xmm0 # sched: [5:2.00] +; SLM-NEXT: mulps (%rdi), %xmm0 # sched: [8:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mulps: +; SANDY: # BB#0: +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fmul <4 x float> %a0, %a1 + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = fmul <4 x float> %1, %2 + ret <4 x float> %3 +} + +define float @test_mulss(float %a0, float %a1, float *%a2) { +; GENERIC-LABEL: test_mulss: +; GENERIC: # BB#0: +; GENERIC-NEXT: mulss %xmm1, %xmm0 +; GENERIC-NEXT: mulss (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mulss: +; ATOM: # BB#0: +; ATOM-NEXT: mulss %xmm1, %xmm0 +; ATOM-NEXT: mulss (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mulss: +; SLM: # BB#0: +; SLM-NEXT: mulss %xmm1, %xmm0 # sched: [5:2.00] +; SLM-NEXT: mulss (%rdi), %xmm0 # sched: [8:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mulss: +; SANDY: # BB#0: +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmulss (%rdi), %xmm0, 
%xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fmul float %a0, %a1 + %2 = load float, float *%a2, align 4 + %3 = fmul float %1, %2 + ret float %3 +} + +define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_orps: +; GENERIC: # BB#0: +; GENERIC-NEXT: orps %xmm1, %xmm0 +; GENERIC-NEXT: orps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_orps: +; ATOM: # BB#0: +; ATOM-NEXT: orps %xmm1, %xmm0 +; ATOM-NEXT: orps (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_orps: +; SLM: # BB#0: +; SLM-NEXT: orps %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: orps (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_orps: +; SANDY: # BB#0: +; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_orps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_orps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x float> %a0 to <4 x i32> + %2 = bitcast <4 x float> %a1 to <4 x 
i32> + %3 = or <4 x i32> %1, %2 + %4 = load <4 x float>, <4 x float> *%a2, align 16 + %5 = bitcast <4 x float> %4 to <4 x i32> + %6 = or <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <4 x float> + ret <4 x float> %7 +} + +define void @test_prefetchnta(i8* %a0) { +; GENERIC-LABEL: test_prefetchnta: +; GENERIC: # BB#0: +; GENERIC-NEXT: prefetchnta (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_prefetchnta: +; ATOM: # BB#0: +; ATOM-NEXT: prefetchnta (%rdi) +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_prefetchnta: +; SLM: # BB#0: +; SLM-NEXT: prefetchnta (%rdi) # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_prefetchnta: +; SANDY: # BB#0: +; SANDY-NEXT: prefetchnta (%rdi) # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_prefetchnta: +; HASWELL: # BB#0: +; HASWELL-NEXT: prefetchnta (%rdi) # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_prefetchnta: +; BTVER2: # BB#0: +; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) + ret void +} +declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone + +define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_rcpps: +; GENERIC: # BB#0: +; GENERIC-NEXT: rcpps %xmm0, %xmm1 +; GENERIC-NEXT: rcpps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_rcpps: +; ATOM: # BB#0: +; ATOM-NEXT: rcpps (%rdi), %xmm1 +; ATOM-NEXT: rcpps %xmm0, %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_rcpps: +; SLM: # BB#0: +; SLM-NEXT: rcpps (%rdi), %xmm1 # sched: [8:1.00] +; SLM-NEXT: rcpps %xmm0, %xmm0 # sched: [5:1.00] +; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movaps %xmm1, 
%xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_rcpps: +; SANDY: # BB#0: +; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rcpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rcpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vrcpps (%rdi), %xmm1 # sched: [7:1.00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +; TODO - rcpss_m + +define <4 x float> @test_rcpss(float %a0, float *%a1) { +; GENERIC-LABEL: test_rcpss: +; GENERIC: # BB#0: +; GENERIC-NEXT: rcpss %xmm0, %xmm0 +; GENERIC-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; GENERIC-NEXT: rcpss %xmm1, %xmm1 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_rcpss: +; ATOM: # BB#0: +; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: rcpss %xmm0, %xmm0 +; ATOM-NEXT: rcpss %xmm1, %xmm1 +; ATOM-NEXT: addps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_rcpss: +; SLM: # BB#0: +; SLM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [3:1.00] +; SLM-NEXT: rcpss %xmm0, %xmm0 # sched: [8:1.00] +; SLM-NEXT: rcpss %xmm1, %xmm1 # sched: [8:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; 
SANDY-LABEL: test_rcpss: +; SANDY: # BB#0: +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rcpss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rcpss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x float> undef, float %a0, i32 0 + %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1) + %3 = load float, float *%a1, align 4 + %4 = insertelement <4 x float> undef, float %3, i32 0 + %5 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %4) + %6 = fadd <4 x float> %2, %5 + ret <4 x float> %6 +} +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_rsqrtps: +; GENERIC: # BB#0: +; GENERIC-NEXT: rsqrtps %xmm0, %xmm1 +; GENERIC-NEXT: rsqrtps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_rsqrtps: +; ATOM: # BB#0: +; ATOM-NEXT: rsqrtps (%rdi), %xmm1 +; ATOM-NEXT: rsqrtps %xmm0, %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_rsqrtps: +; SLM: # BB#0: +; SLM-NEXT: rsqrtps (%rdi), %xmm1 # 
sched: [8:1.00] +; SLM-NEXT: rsqrtps %xmm0, %xmm0 # sched: [5:1.00] +; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_rsqrtps: +; SANDY: # BB#0: +; SANDY-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rsqrtps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rsqrtps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [7:1.00] +; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +; TODO - rsqrtss_m + +define <4 x float> @test_rsqrtss(float %a0, float *%a1) { +; GENERIC-LABEL: test_rsqrtss: +; GENERIC: # BB#0: +; GENERIC-NEXT: rsqrtss %xmm0, %xmm0 +; GENERIC-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; GENERIC-NEXT: rsqrtss %xmm1, %xmm1 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_rsqrtss: +; ATOM: # BB#0: +; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: rsqrtss %xmm0, %xmm0 +; ATOM-NEXT: rsqrtss %xmm1, %xmm1 +; ATOM-NEXT: addps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_rsqrtss: +; SLM: # BB#0: +; SLM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [3:1.00] +; 
SLM-NEXT: rsqrtss %xmm0, %xmm0 # sched: [8:1.00] +; SLM-NEXT: rsqrtss %xmm1, %xmm1 # sched: [8:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_rsqrtss: +; SANDY: # BB#0: +; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rsqrtss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rsqrtss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x float> undef, float %a0, i32 0 + %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1) + %3 = load float, float *%a1, align 4 + %4 = insertelement <4 x float> undef, float %3, i32 0 + %5 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %4) + %6 = fadd <4 x float> %2, %5 + ret <4 x float> %6 +} +declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone + +define void @test_sfence() { +; GENERIC-LABEL: test_sfence: +; GENERIC: # BB#0: +; GENERIC-NEXT: sfence +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_sfence: +; ATOM: # BB#0: +; ATOM-NEXT: sfence +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; 
SLM-LABEL: test_sfence: +; SLM: # BB#0: +; SLM-NEXT: sfence # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_sfence: +; SANDY: # BB#0: +; SANDY-NEXT: sfence # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sfence: +; HASWELL: # BB#0: +; HASWELL-NEXT: sfence # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sfence: +; BTVER2: # BB#0: +; BTVER2-NEXT: sfence # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.sse.sfence() + ret void +} +declare void @llvm.x86.sse.sfence() nounwind readnone + +define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) nounwind { +; GENERIC-LABEL: test_shufps: +; GENERIC: # BB#0: +; GENERIC-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; GENERIC-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_shufps: +; ATOM: # BB#0: +; ATOM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; ATOM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_shufps: +; SLM: # BB#0: +; SLM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00] +; SLM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_shufps: +; SANDY: # BB#0: +; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00] +; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_shufps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00] +; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_shufps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] +; 
BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> + ret <4 x float> %3 +} + +define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_sqrtps: +; GENERIC: # BB#0: +; GENERIC-NEXT: sqrtps %xmm0, %xmm1 +; GENERIC-NEXT: sqrtps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_sqrtps: +; ATOM: # BB#0: +; ATOM-NEXT: sqrtps %xmm0, %xmm1 +; ATOM-NEXT: sqrtps (%rdi), %xmm0 +; ATOM-NEXT: addps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_sqrtps: +; SLM: # BB#0: +; SLM-NEXT: sqrtps (%rdi), %xmm1 # sched: [18:1.00] +; SLM-NEXT: sqrtps %xmm0, %xmm0 # sched: [15:1.00] +; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_sqrtps: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00] +; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00] +; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsqrtps (%rdi), %xmm1 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> 
%2) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +; TODO - sqrtss_m + +define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_sqrtss: +; GENERIC: # BB#0: +; GENERIC-NEXT: sqrtss %xmm0, %xmm0 +; GENERIC-NEXT: movaps (%rdi), %xmm1 +; GENERIC-NEXT: sqrtss %xmm1, %xmm1 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_sqrtss: +; ATOM: # BB#0: +; ATOM-NEXT: movaps (%rdi), %xmm1 +; ATOM-NEXT: sqrtss %xmm0, %xmm0 +; ATOM-NEXT: sqrtss %xmm1, %xmm1 +; ATOM-NEXT: addps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_sqrtss: +; SLM: # BB#0: +; SLM-NEXT: movaps (%rdi), %xmm1 # sched: [3:1.00] +; SLM-NEXT: sqrtss %xmm0, %xmm0 # sched: [18:1.00] +; SLM-NEXT: sqrtss %xmm1, %xmm1 # sched: [18:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_sqrtss: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00] +; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50] +; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00] +; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50] +; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps (%rdi), %xmm1 # sched: [5:1.00] +; BTVER2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> 
%a0) + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone + +define i32 @test_stmxcsr() { +; GENERIC-LABEL: test_stmxcsr: +; GENERIC: # BB#0: +; GENERIC-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; GENERIC-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_stmxcsr: +; ATOM: # BB#0: +; ATOM-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; ATOM-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_stmxcsr: +; SLM: # BB#0: +; SLM-NEXT: stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; SLM-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_stmxcsr: +; SANDY: # BB#0: +; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_stmxcsr: +; HASWELL: # BB#0: +; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00] +; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_stmxcsr: +; BTVER2: # BB#0: +; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1, align 4 + ret i32 %3 +} +declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone + +define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_subps: +; GENERIC: # BB#0: +; GENERIC-NEXT: subps %xmm1, %xmm0 +; GENERIC-NEXT: subps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_subps: +; ATOM: # BB#0: +; ATOM-NEXT: subps %xmm1, %xmm0 +; ATOM-NEXT: subps (%rdi), %xmm0 +; 
ATOM-NEXT: retq +; +; SLM-LABEL: test_subps: +; SLM: # BB#0: +; SLM-NEXT: subps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: subps (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_subps: +; SANDY: # BB#0: +; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fsub <4 x float> %a0, %a1 + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = fsub <4 x float> %1, %2 + ret <4 x float> %3 +} + +define float @test_subss(float %a0, float %a1, float *%a2) { +; GENERIC-LABEL: test_subss: +; GENERIC: # BB#0: +; GENERIC-NEXT: subss %xmm1, %xmm0 +; GENERIC-NEXT: subss (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_subss: +; ATOM: # BB#0: +; ATOM-NEXT: subss %xmm1, %xmm0 +; ATOM-NEXT: subss (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_subss: +; SLM: # BB#0: +; SLM-NEXT: subss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: subss (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_subss: +; SANDY: # BB#0: +; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subss: +; BTVER2: # BB#0: +; BTVER2-NEXT: 
vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fsub float %a0, %a1 + %2 = load float, float *%a2, align 4 + %3 = fsub float %1, %2 + ret float %3 +} + +define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_ucomiss: +; GENERIC: # BB#0: +; GENERIC-NEXT: ucomiss %xmm1, %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %cl +; GENERIC-NEXT: andb %al, %cl +; GENERIC-NEXT: ucomiss (%rdi), %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %dl +; GENERIC-NEXT: andb %al, %dl +; GENERIC-NEXT: orb %cl, %dl +; GENERIC-NEXT: movzbl %dl, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_ucomiss: +; ATOM: # BB#0: +; ATOM-NEXT: ucomiss %xmm1, %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %cl +; ATOM-NEXT: andb %al, %cl +; ATOM-NEXT: ucomiss (%rdi), %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %dl +; ATOM-NEXT: andb %al, %dl +; ATOM-NEXT: orb %cl, %dl +; ATOM-NEXT: movzbl %dl, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_ucomiss: +; SLM: # BB#0: +; SLM-NEXT: ucomiss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %cl # sched: [1:0.50] +; SLM-NEXT: andb %al, %cl # sched: [1:0.50] +; SLM-NEXT: ucomiss (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %dl # sched: [1:0.50] +; SLM-NEXT: andb %al, %dl # sched: [1:0.50] +; SLM-NEXT: orb %cl, %dl # sched: [1:0.50] +; SLM-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ucomiss: +; SANDY: # BB#0: +; SANDY-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] +; SANDY-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: andb 
%al, %dl # sched: [1:0.33] +; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] +; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_ucomiss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %cl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] +; HASWELL-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %dl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] +; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] +; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ucomiss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %cl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] +; BTVER2-NEXT: vucomiss (%rdi), %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %dl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %dl # sched: [1:0.50] +; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] +; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 4 + %3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2) + %4 = or i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_unpckhps: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_unpckhps: +; ATOM: # 
BB#0: +; ATOM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; ATOM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_unpckhps: +; SLM: # BB#0: +; SLM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SLM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_unpckhps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpckhps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpckhps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> + ret <4 x float> %3 +} + +define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_unpcklps: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_unpcklps: +; ATOM: # BB#0: +; ATOM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; ATOM-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_unpcklps: +; SLM: # BB#0: +; SLM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SLM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_unpcklps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpcklps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpcklps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> + ret <4 x float> %3 +} + +define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; GENERIC-LABEL: test_xorps: +; GENERIC: # BB#0: +; GENERIC-NEXT: xorps %xmm1, %xmm0 +; GENERIC-NEXT: xorps (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_xorps: +; ATOM: # BB#0: +; ATOM-NEXT: xorps %xmm1, %xmm0 +; ATOM-NEXT: xorps (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_xorps: +; SLM: # BB#0: +; SLM-NEXT: xorps %xmm1, %xmm0 # sched: [1:0.50] +; 
SLM-NEXT: xorps (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_xorps: +; SANDY: # BB#0: +; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_xorps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_xorps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x float> %a0 to <4 x i32> + %2 = bitcast <4 x float> %a1 to <4 x i32> + %3 = xor <4 x i32> %1, %2 + %4 = load <4 x float>, <4 x float> *%a2, align 16 + %5 = bitcast <4 x float> %4 to <4 x i32> + %6 = xor <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <4 x float> + ret <4 x float> %7 +} + +!0 = !{i32 1} diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll new file mode 100644 index 000000000000..33a4f413b683 --- /dev/null +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -0,0 +1,6039 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY 
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 + +define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_addpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: addpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addpd: +; ATOM: # BB#0: +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: addpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addpd: +; SLM: # BB#0: +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fadd <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_addsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_addsd: +; GENERIC: # 
BB#0: +; GENERIC-NEXT: addsd %xmm1, %xmm0 +; GENERIC-NEXT: addsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addsd: +; ATOM: # BB#0: +; ATOM-NEXT: addsd %xmm1, %xmm0 +; ATOM-NEXT: addsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addsd: +; SLM: # BB#0: +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addsd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fadd double %1, %2 + ret double %3 +} + +define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_andpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: andpd %xmm1, %xmm0 +; GENERIC-NEXT: andpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_andpd: +; ATOM: # BB#0: +; ATOM-NEXT: andpd %xmm1, %xmm0 +; ATOM-NEXT: andpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_andpd: +; SLM: # BB#0: +; SLM-NEXT: andpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: andpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_andpd: +; SANDY: # BB#0: +; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # 
sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> %a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = and <4 x i32> %1, %2 + %4 = load <2 x double>, <2 x double> *%a2, align 16 + %5 = bitcast <2 x double> %4 to <4 x i32> + %6 = and <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <2 x double> + %8 = fadd <2 x double> %a1, %7 + ret <2 x double> %8 +} + +define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_andnotpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnpd %xmm1, %xmm0 +; GENERIC-NEXT: andnpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_andnotpd: +; ATOM: # BB#0: +; ATOM-NEXT: andnpd %xmm1, %xmm0 +; ATOM-NEXT: andnpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_andnotpd: +; SLM: # BB#0: +; SLM-NEXT: andnpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: andnpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_andnotpd: +; SANDY: # BB#0: +; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: 
test_andnotpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andnotpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> %a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = xor <4 x i32> %1, + %4 = and <4 x i32> %3, %2 + %5 = load <2 x double>, <2 x double> *%a2, align 16 + %6 = bitcast <2 x double> %5 to <4 x i32> + %7 = xor <4 x i32> %4, + %8 = and <4 x i32> %6, %7 + %9 = bitcast <4 x i32> %8 to <2 x double> + %10 = fadd <2 x double> %a1, %9 + ret <2 x double> %10 +} + +define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_cmppd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpeqpd %xmm0, %xmm1 +; GENERIC-NEXT: cmpeqpd (%rdi), %xmm0 +; GENERIC-NEXT: orpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cmppd: +; ATOM: # BB#0: +; ATOM-NEXT: cmpeqpd %xmm0, %xmm1 +; ATOM-NEXT: cmpeqpd (%rdi), %xmm0 +; ATOM-NEXT: orpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cmppd: +; SLM: # BB#0: +; SLM-NEXT: cmpeqpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: cmpeqpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cmppd: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmppd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: 
[3:1.00] +; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmppd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fcmp oeq <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fcmp oeq <2 x double> %a0, %2 + %4 = or <2 x i1> %1, %3 + %5 = sext <2 x i1> %4 to <2 x i64> + %6 = bitcast <2 x i64> %5 to <2 x double> + ret <2 x double> %6 +} + +define double @test_cmpsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_cmpsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpeqsd %xmm1, %xmm0 +; GENERIC-NEXT: cmpeqsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cmpsd: +; ATOM: # BB#0: +; ATOM-NEXT: cmpeqsd %xmm1, %xmm0 +; ATOM-NEXT: cmpeqsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cmpsd: +; SLM: # BB#0: +; SLM-NEXT: cmpeqsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: cmpeqsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cmpsd: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmpsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmpsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x double> undef, double %a0, i32 0 + %2 = insertelement <2 x double> 
undef, double %a1, i32 0 + %3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0) + %4 = load double, double *%a2, align 8 + %5 = insertelement <2 x double> undef, double %4, i32 0 + %6 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %3, <2 x double> %5, i8 0) + %7 = extractelement <2 x double> %6, i32 0 + ret double %7 +} +declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone + +define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_comisd: +; GENERIC: # BB#0: +; GENERIC-NEXT: comisd %xmm1, %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %cl +; GENERIC-NEXT: andb %al, %cl +; GENERIC-NEXT: comisd (%rdi), %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %dl +; GENERIC-NEXT: andb %al, %dl +; GENERIC-NEXT: orb %cl, %dl +; GENERIC-NEXT: movzbl %dl, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_comisd: +; ATOM: # BB#0: +; ATOM-NEXT: comisd %xmm1, %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %cl +; ATOM-NEXT: andb %al, %cl +; ATOM-NEXT: comisd (%rdi), %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %dl +; ATOM-NEXT: andb %al, %dl +; ATOM-NEXT: orb %cl, %dl +; ATOM-NEXT: movzbl %dl, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_comisd: +; SLM: # BB#0: +; SLM-NEXT: comisd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %cl # sched: [1:0.50] +; SLM-NEXT: andb %al, %cl # sched: [1:0.50] +; SLM-NEXT: comisd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %dl # sched: [1:0.50] +; SLM-NEXT: andb %al, %dl # sched: [1:0.50] +; SLM-NEXT: orb %cl, %dl # sched: [1:0.50] +; SLM-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_comisd: +; SANDY: # BB#0: +; SANDY-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: 
andb %al, %cl # sched: [1:0.33] +; SANDY-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] +; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] +; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_comisd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %cl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] +; HASWELL-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %dl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] +; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] +; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_comisd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %cl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] +; BTVER2-NEXT: vcomisd (%rdi), %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %dl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %dl # sched: [1:0.50] +; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] +; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 8 + %3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2) + %4 = or i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { +; GENERIC-LABEL: test_cvtdq2pd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtdq2pd %xmm0, %xmm1 +; 
GENERIC-NEXT: cvtdq2pd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtdq2pd: +; ATOM: # BB#0: +; ATOM-NEXT: cvtdq2pd %xmm0, %xmm1 +; ATOM-NEXT: cvtdq2pd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtdq2pd: +; SLM: # BB#0: +; SLM-NEXT: cvtdq2pd %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtdq2pd (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtdq2pd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2pd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2pd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> + %2 = sitofp <2 x i32> %1 to <2 x double> + %3 = load <4 x i32>, <4 x i32>*%a1, align 16 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> + %5 = sitofp <2 x i32> %4 to <2 x double> + %6 = fadd <2 x double> %2, %5 + ret <2 x double> %6 +} + +define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) { +; GENERIC-LABEL: test_cvtdq2ps: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtdq2ps %xmm0, %xmm1 +; GENERIC-NEXT: cvtdq2ps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtdq2ps: +; ATOM: # BB#0: +; ATOM-NEXT: cvtdq2ps (%rdi), %xmm1 +; ATOM-NEXT: 
cvtdq2ps %xmm0, %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtdq2ps: +; SLM: # BB#0: +; SLM-NEXT: cvtdq2ps %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtdq2ps (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtdq2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp <4 x i32> %a0 to <4 x float> + %2 = load <4 x i32>, <4 x i32>*%a1, align 16 + %3 = sitofp <4 x i32> %2 to <4 x float> + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} + +define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_cvtpd2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtpd2dq %xmm0, %xmm1 +; GENERIC-NEXT: cvtpd2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtpd2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtpd2dq (%rdi), %xmm1 +; ATOM-NEXT: cvtpd2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtpd2dq: +; SLM: # BB#0: +; SLM-NEXT: cvtpd2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtpd2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, 
%xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtpd2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2) + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone + +define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_cvtpd2ps: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtpd2ps %xmm0, %xmm1 +; GENERIC-NEXT: cvtpd2ps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtpd2ps: +; ATOM: # BB#0: +; ATOM-NEXT: cvtpd2ps (%rdi), %xmm1 +; ATOM-NEXT: cvtpd2ps %xmm0, %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtpd2ps: +; SLM: # BB#0: +; SLM-NEXT: cvtpd2ps %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtpd2ps (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtpd2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] 
+; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone + +define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_cvtps2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtps2dq %xmm0, %xmm1 +; GENERIC-NEXT: cvtps2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtps2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtps2dq (%rdi), %xmm1 +; ATOM-NEXT: cvtps2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtps2dq: +; SLM: # BB#0: +; SLM-NEXT: cvtps2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtps2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtps2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; 
HASWELL-LABEL: test_cvtps2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtps2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2) + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone + +define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_cvtps2pd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtps2pd %xmm0, %xmm1 +; GENERIC-NEXT: cvtps2pd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtps2pd: +; ATOM: # BB#0: +; ATOM-NEXT: cvtps2pd (%rdi), %xmm1 +; ATOM-NEXT: cvtps2pd %xmm0, %xmm0 +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movapd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtps2pd: +; SLM: # BB#0: +; SLM-NEXT: cvtps2pd %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtps2pd (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtps2pd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtps2pd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00] +; 
HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtps2pd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> + %2 = fpext <2 x float> %1 to <2 x double> + %3 = load <4 x float>, <4 x float> *%a1, align 16 + %4 = shufflevector <4 x float> %3, <4 x float> undef, <2 x i32> + %5 = fpext <2 x float> %4 to <2 x double> + %6 = fadd <2 x double> %2, %5 + ret <2 x double> %6 +} + +define i32 @test_cvtsd2si(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvtsd2si: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsd2si %xmm0, %ecx +; GENERIC-NEXT: cvtsd2si (%rdi), %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsd2si: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsd2si (%rdi), %eax +; ATOM-NEXT: cvtsd2si %xmm0, %ecx +; ATOM-NEXT: addl %ecx, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsd2si: +; SLM: # BB#0: +; SLM-NEXT: cvtsd2si (%rdi), %eax # sched: [7:1.00] +; SLM-NEXT: cvtsd2si %xmm0, %ecx # sched: [4:0.50] +; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsd2si: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00] +; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsd2si: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00] +; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsd2si: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00] +; BTVER2-NEXT: vcvtsd2si 
%xmm0, %ecx # sched: [3:1.00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x double> undef, double %a0, i32 0 + %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1) + %3 = load double, double *%a1, align 8 + %4 = insertelement <2 x double> undef, double %3, i32 0 + %5 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %4) + %6 = add i32 %2, %5 + ret i32 %6 +} +declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone + +define i64 @test_cvtsd2siq(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvtsd2siq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsd2si %xmm0, %rcx +; GENERIC-NEXT: cvtsd2si (%rdi), %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsd2siq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsd2si (%rdi), %rax +; ATOM-NEXT: cvtsd2si %xmm0, %rcx +; ATOM-NEXT: addq %rcx, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsd2siq: +; SLM: # BB#0: +; SLM-NEXT: cvtsd2si (%rdi), %rax # sched: [7:1.00] +; SLM-NEXT: cvtsd2si %xmm0, %rcx # sched: [4:0.50] +; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsd2siq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] +; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsd2siq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00] +; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsd2siq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00] +; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x double> undef, double %a0, i32 0 + %2 = call i64 
@llvm.x86.sse2.cvtsd2si64(<2 x double> %1) + %3 = load double, double *%a1, align 8 + %4 = insertelement <2 x double> undef, double %3, i32 0 + %5 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %4) + %6 = add i64 %2, %5 + ret i64 %6 +} +declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone + +define float @test_cvtsd2ss(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvtsd2ss: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm1 +; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm0 +; GENERIC-NEXT: addss %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsd2ss: +; ATOM: # BB#0: +; ATOM-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; ATOM-NEXT: cvtsd2ss %xmm0, %xmm2 +; ATOM-NEXT: xorps %xmm0, %xmm0 +; ATOM-NEXT: cvtsd2ss %xmm1, %xmm0 +; ATOM-NEXT: addss %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsd2ss: +; SLM: # BB#0: +; SLM-NEXT: cvtsd2ss %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: cvtsd2ss %xmm0, %xmm0 # sched: [4:0.50] +; SLM-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsd2ss: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsd2ss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00] +; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsd2ss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00] +; 
BTVER2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptrunc double %a0 to float + %2 = load double, double *%a1, align 8 + %3 = fptrunc double %2 to float + %4 = fadd float %1, %3 + ret float %4 +} + +define double @test_cvtsi2sd(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_cvtsi2sd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsi2sdl %edi, %xmm1 +; GENERIC-NEXT: cvtsi2sdl (%rsi), %xmm0 +; GENERIC-NEXT: addsd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsi2sd: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsi2sdl (%rsi), %xmm0 +; ATOM-NEXT: cvtsi2sdl %edi, %xmm1 +; ATOM-NEXT: addsd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsi2sd: +; SLM: # BB#0: +; SLM-NEXT: cvtsi2sdl (%rsi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: cvtsi2sdl %edi, %xmm1 # sched: [4:0.50] +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsi2sd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsi2sd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsi2sd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp i32 %a0 to double + %2 = load i32, i32 *%a1, align 8 + %3 = sitofp i32 %2 to double + %4 = 
fadd double %1, %3 + ret double %4 +} + +define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_cvtsi2sdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsi2sdq %rdi, %xmm1 +; GENERIC-NEXT: cvtsi2sdq (%rsi), %xmm0 +; GENERIC-NEXT: addsd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsi2sdq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsi2sdq (%rsi), %xmm0 +; ATOM-NEXT: cvtsi2sdq %rdi, %xmm1 +; ATOM-NEXT: addsd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsi2sdq: +; SLM: # BB#0: +; SLM-NEXT: cvtsi2sdq (%rsi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: cvtsi2sdq %rdi, %xmm1 # sched: [4:0.50] +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsi2sdq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsi2sdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsi2sdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp i64 %a0 to double + %2 = load i64, i64 *%a1, align 8 + %3 = sitofp i64 %2 to double + %4 = fadd double %1, %3 + ret double %4 +} + +; TODO - cvtss2sd_m + +define double @test_cvtss2sd(float %a0, float *%a1) { +; GENERIC-LABEL: test_cvtss2sd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtss2sd %xmm0, %xmm1 +; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; GENERIC-NEXT: cvtss2sd %xmm0, %xmm0 +; GENERIC-NEXT: addsd 
%xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtss2sd: +; ATOM: # BB#0: +; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: cvtss2sd %xmm0, %xmm2 +; ATOM-NEXT: xorps %xmm0, %xmm0 +; ATOM-NEXT: cvtss2sd %xmm1, %xmm0 +; ATOM-NEXT: addsd %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtss2sd: +; SLM: # BB#0: +; SLM-NEXT: cvtss2sd %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [3:1.00] +; SLM-NEXT: cvtss2sd %xmm0, %xmm0 # sched: [4:0.50] +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtss2sd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtss2sd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00] +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtss2sd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fpext float %a0 to double + %2 = load float, float *%a1, align 4 + %3 = fpext float %2 to double + %4 = fadd double %1, %3 + ret double %4 +} + +define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_cvttpd2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttpd2dq 
%xmm0, %xmm1 +; GENERIC-NEXT: cvttpd2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttpd2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvttpd2dq (%rdi), %xmm1 +; ATOM-NEXT: cvttpd2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttpd2dq: +; SLM: # BB#0: +; SLM-NEXT: cvttpd2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvttpd2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttpd2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttpd2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttpd2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi <2 x double> %a0 to <2 x i32> + %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> + %3 = load <2 x double>, <2 x double> *%a1, align 16 + %4 = fptosi <2 x double> %3 to <2 x i32> + %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> + %6 = add <4 x i32> %2, %5 + ret <4 x i32> %6 +} + +define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_cvttps2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttps2dq %xmm0, %xmm1 +; GENERIC-NEXT: cvttps2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: 
test_cvttps2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvttps2dq (%rdi), %xmm1 +; ATOM-NEXT: cvttps2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttps2dq: +; SLM: # BB#0: +; SLM-NEXT: cvttps2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvttps2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttps2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttps2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttps2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi <4 x float> %a0 to <4 x i32> + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = fptosi <4 x float> %2 to <4 x i32> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define i32 @test_cvttsd2si(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvttsd2si: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttsd2si %xmm0, %ecx +; GENERIC-NEXT: cvttsd2si (%rdi), %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttsd2si: +; ATOM: # BB#0: +; ATOM-NEXT: cvttsd2si (%rdi), %eax +; ATOM-NEXT: cvttsd2si %xmm0, %ecx +; ATOM-NEXT: addl %ecx, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttsd2si: +; SLM: # BB#0: +; SLM-NEXT: cvttsd2si (%rdi), %eax # sched: [7:1.00] +; SLM-NEXT: cvttsd2si %xmm0, %ecx # 
sched: [4:0.50] +; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttsd2si: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] +; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttsd2si: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00] +; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttsd2si: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00] +; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi double %a0 to i32 + %2 = load double, double *%a1, align 8 + %3 = fptosi double %2 to i32 + %4 = add i32 %1, %3 + ret i32 %4 +} + +define i64 @test_cvttsd2siq(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvttsd2siq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttsd2si %xmm0, %rcx +; GENERIC-NEXT: cvttsd2si (%rdi), %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttsd2siq: +; ATOM: # BB#0: +; ATOM-NEXT: cvttsd2si (%rdi), %rax +; ATOM-NEXT: cvttsd2si %xmm0, %rcx +; ATOM-NEXT: addq %rcx, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttsd2siq: +; SLM: # BB#0: +; SLM-NEXT: cvttsd2si (%rdi), %rax # sched: [7:1.00] +; SLM-NEXT: cvttsd2si %xmm0, %rcx # sched: [4:0.50] +; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttsd2siq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] +; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttsd2siq: +; HASWELL: # BB#0: +; 
HASWELL-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00] +; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttsd2siq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00] +; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi double %a0 to i64 + %2 = load double, double *%a1, align 8 + %3 = fptosi double %2 to i64 + %4 = add i64 %1, %3 + ret i64 %4 +} + +define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_divpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: divpd %xmm1, %xmm0 +; GENERIC-NEXT: divpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_divpd: +; ATOM: # BB#0: +; ATOM-NEXT: divpd %xmm1, %xmm0 +; ATOM-NEXT: divpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_divpd: +; SLM: # BB#0: +; SLM-NEXT: divpd %xmm1, %xmm0 # sched: [34:34.00] +; SLM-NEXT: divpd (%rdi), %xmm0 # sched: [37:34.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_divpd: +; SANDY: # BB#0: +; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fdiv <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fdiv <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_divsd(double %a0, 
double %a1, double *%a2) { +; GENERIC-LABEL: test_divsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: divsd %xmm1, %xmm0 +; GENERIC-NEXT: divsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_divsd: +; ATOM: # BB#0: +; ATOM-NEXT: divsd %xmm1, %xmm0 +; ATOM-NEXT: divsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_divsd: +; SLM: # BB#0: +; SLM-NEXT: divsd %xmm1, %xmm0 # sched: [34:34.00] +; SLM-NEXT: divsd (%rdi), %xmm0 # sched: [37:34.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_divsd: +; SANDY: # BB#0: +; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fdiv double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fdiv double %1, %2 + ret double %3 +} + +define void @test_lfence() { +; GENERIC-LABEL: test_lfence: +; GENERIC: # BB#0: +; GENERIC-NEXT: lfence +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_lfence: +; ATOM: # BB#0: +; ATOM-NEXT: lfence +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lfence: +; SLM: # BB#0: +; SLM-NEXT: lfence # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lfence: +; SANDY: # BB#0: +; SANDY-NEXT: lfence # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_lfence: +; HASWELL: # BB#0: +; HASWELL-NEXT: lfence # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lfence: +; 
BTVER2: # BB#0: +; BTVER2-NEXT: lfence # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.sse2.lfence() + ret void +} +declare void @llvm.x86.sse2.lfence() nounwind readnone + +define void @test_mfence() { +; GENERIC-LABEL: test_mfence: +; GENERIC: # BB#0: +; GENERIC-NEXT: mfence +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mfence: +; ATOM: # BB#0: +; ATOM-NEXT: mfence +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mfence: +; SLM: # BB#0: +; SLM-NEXT: mfence # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mfence: +; SANDY: # BB#0: +; SANDY-NEXT: mfence # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mfence: +; HASWELL: # BB#0: +; HASWELL-NEXT: mfence # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mfence: +; BTVER2: # BB#0: +; BTVER2-NEXT: mfence # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.sse2.mfence() + ret void +} +declare void @llvm.x86.sse2.mfence() nounwind readnone + +define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) { +; GENERIC-LABEL: test_maskmovdqu: +; GENERIC: # BB#0: +; GENERIC-NEXT: maskmovdqu %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maskmovdqu: +; ATOM: # BB#0: +; ATOM-NEXT: maskmovdqu %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maskmovdqu: +; SLM: # BB#0: +; SLM-NEXT: maskmovdqu %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maskmovdqu: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovdqu: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: 
test_maskmovdqu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) + ret void +} +declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind + +define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_maxpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: maxpd %xmm1, %xmm0 +; GENERIC-NEXT: maxpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maxpd: +; ATOM: # BB#0: +; ATOM-NEXT: maxpd %xmm1, %xmm0 +; ATOM-NEXT: maxpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maxpd: +; SLM: # BB#0: +; SLM-NEXT: maxpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: maxpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maxpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_maxsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: maxsd %xmm1, 
%xmm0 +; GENERIC-NEXT: maxsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maxsd: +; ATOM: # BB#0: +; ATOM-NEXT: maxsd %xmm1, %xmm0 +; ATOM-NEXT: maxsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maxsd: +; SLM: # BB#0: +; SLM-NEXT: maxsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: maxsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maxsd: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_minpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: minpd %xmm1, %xmm0 +; GENERIC-NEXT: minpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_minpd: +; ATOM: # BB#0: +; ATOM-NEXT: minpd %xmm1, %xmm0 +; ATOM-NEXT: minpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_minpd: +; SLM: # BB#0: +; SLM-NEXT: minpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: minpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_minpd: +; SANDY: # BB#0: +; SANDY-NEXT: vminpd %xmm1, %xmm0, %xmm0 # 
sched: [3:1.00] +; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_minsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: minsd %xmm1, %xmm0 +; GENERIC-NEXT: minsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_minsd: +; ATOM: # BB#0: +; ATOM-NEXT: minsd %xmm1, %xmm0 +; ATOM-NEXT: minsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_minsd: +; SLM: # BB#0: +; SLM-NEXT: minsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: minsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_minsd: +; SANDY: # BB#0: +; SANDY-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vminsd (%rdi), 
%xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone + +define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_movapd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movapd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm0, %xmm0 +; GENERIC-NEXT: movapd %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movapd: +; ATOM: # BB#0: +; ATOM-NEXT: movapd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm0, %xmm0 +; ATOM-NEXT: movapd %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movapd: +; SLM: # BB#0: +; SLM-NEXT: movapd (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movapd %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movapd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movapd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movapd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovapd (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x double>, <2 x double> *%a0, align 16 + %2 = fadd <2 x double> %1, %1 + store <2 x double> %2, <2 x double> *%a1, align 16 + ret 
void +} + +define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) { +; GENERIC-LABEL: test_movdqa: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm0, %xmm0 +; GENERIC-NEXT: movdqa %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movdqa: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm0, %xmm0 +; ATOM-NEXT: movdqa %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movdqa: +; SLM: # BB#0: +; SLM-NEXT: movdqa (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movdqa: +; SANDY: # BB#0: +; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movdqa: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movdqa: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovdqa (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x i64>, <2 x i64> *%a0, align 16 + %2 = add <2 x i64> %1, %1 + store <2 x i64> %2, <2 x i64> *%a1, align 16 + ret void +} + +define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) { +; GENERIC-LABEL: test_movdqu: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqu (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm0, %xmm0 +; GENERIC-NEXT: movdqu %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movdqu: +; ATOM: # BB#0: +; ATOM-NEXT: movdqu (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm0, %xmm0 +; ATOM-NEXT: movdqu %xmm0, (%rsi) +; 
ATOM-NEXT: retq +; +; SLM-LABEL: test_movdqu: +; SLM: # BB#0: +; SLM-NEXT: movdqu (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movdqu %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movdqu: +; SANDY: # BB#0: +; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movdqu: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movdqu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovdqu (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x i64>, <2 x i64> *%a0, align 1 + %2 = add <2 x i64> %1, %1 + store <2 x i64> %2, <2 x i64> *%a1, align 1 + ret void +} + +define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_movd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movd %edi, %xmm1 +; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: paddd %xmm0, %xmm1 +; GENERIC-NEXT: paddd %xmm0, %xmm2 +; GENERIC-NEXT: movd %xmm2, %eax +; GENERIC-NEXT: movd %xmm1, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movd: +; ATOM: # BB#0: +; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movd %xmm1, %eax +; ATOM-NEXT: movd %edi, %xmm1 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movd %xmm1, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movd: +; SLM: # BB#0: +; SLM-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [3:1.00] +; SLM-NEXT: movd %edi, %xmm1 # sched: [1:0.50] +; 
SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movd %xmm1, (%rsi) # sched: [1:1.00] +; SLM-NEXT: paddd %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: movd %xmm2, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:0.33] +; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovd %xmm0, %eax # sched: [1:0.33] +; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00] +; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovd %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vmovd %edi, %xmm1 # sched: [1:0.17] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %2 = load i32, i32 *%a2 + %3 = insertelement <4 x i32> undef, i32 %2, i32 0 + %4 = add <4 x i32> %a0, %1 + %5 = add <4 x i32> %a0, %3 + %6 = extractelement <4 x i32> %4, i32 0 + %7 = extractelement <4 x i32> %5, i32 0 + store i32 %6, i32* %a2 + ret i32 %7 +} + +define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_movd_64: +; GENERIC: # BB#0: +; 
GENERIC-NEXT: movd %rdi, %xmm1 +; GENERIC-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; GENERIC-NEXT: paddq %xmm0, %xmm1 +; GENERIC-NEXT: paddq %xmm0, %xmm2 +; GENERIC-NEXT: movd %xmm2, %rax +; GENERIC-NEXT: movq %xmm1, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movd_64: +; ATOM: # BB#0: +; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; ATOM-NEXT: movd %rdi, %xmm2 +; ATOM-NEXT: paddq %xmm0, %xmm2 +; ATOM-NEXT: paddq %xmm0, %xmm1 +; ATOM-NEXT: movq %xmm2, (%rsi) +; ATOM-NEXT: movd %xmm1, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movd_64: +; SLM: # BB#0: +; SLM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: movd %rdi, %xmm1 # sched: [1:0.50] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movq %xmm1, (%rsi) # sched: [1:1.00] +; SLM-NEXT: paddq %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: movd %xmm2, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movd_64: +; SANDY: # BB#0: +; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.33] +; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovq %xmm0, %rax # sched: [1:0.33] +; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movd_64: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00] +; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovq %xmm0, %rax # sched: [1:1.00] +; HASWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movd_64: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:1.00] +; BTVER2-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.17] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, 
%xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x i64> undef, i64 %a1, i64 0 + %2 = load i64, i64 *%a2 + %3 = insertelement <2 x i64> undef, i64 %2, i64 0 + %4 = add <2 x i64> %a0, %1 + %5 = add <2 x i64> %a0, %3 + %6 = extractelement <2 x i64> %4, i64 0 + %7 = extractelement <2 x i64> %5, i64 0 + store i64 %6, i64* %a2 + ret i64 %7 +} + +define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { +; GENERIC-LABEL: test_movhpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; GENERIC-NEXT: addpd %xmm0, %xmm1 +; GENERIC-NEXT: movhpd %xmm1, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movhpd: +; ATOM: # BB#0: +; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movhpd %xmm1, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movhpd: +; SLM: # BB#0: +; SLM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movhpd %xmm1, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movhpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movhpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movhpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: 
[3:1.00] +; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast x86_mmx* %a2 to double* + %2 = load double, double *%1, align 8 + %3 = insertelement <2 x double> %a1, double %2, i32 1 + %4 = fadd <2 x double> %a0, %3 + %5 = extractelement <2 x double> %4, i32 1 + store double %5, double* %1 + ret void +} + +define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { +; GENERIC-LABEL: test_movlpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; GENERIC-NEXT: addpd %xmm0, %xmm1 +; GENERIC-NEXT: movlpd %xmm1, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movlpd: +; ATOM: # BB#0: +; ATOM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movlpd %xmm1, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movlpd: +; SLM: # BB#0: +; SLM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [4:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movlpd %xmm1, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movlpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movlpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movlpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast x86_mmx* %a2 to double* + %2 = load double, double *%1, align 8 + 
%3 = insertelement <2 x double> %a1, double %2, i32 0 + %4 = fadd <2 x double> %a0, %3 + %5 = extractelement <2 x double> %4, i32 0 + store double %5, double* %1 + ret void +} + +define i32 @test_movmskpd(<2 x double> %a0) { +; GENERIC-LABEL: test_movmskpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movmskpd %xmm0, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movmskpd: +; ATOM: # BB#0: +; ATOM-NEXT: movmskpd %xmm0, %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movmskpd: +; SLM: # BB#0: +; SLM-NEXT: movmskpd %xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movmskpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movmskpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movmskpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone + +define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) { +; GENERIC-LABEL: test_movntdqa: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddq %xmm0, %xmm0 +; GENERIC-NEXT: movntdq %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movntdqa: +; ATOM: # BB#0: +; ATOM-NEXT: paddq %xmm0, %xmm0 +; ATOM-NEXT: movntdq %xmm0, (%rdi) +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movntdqa: +; SLM: # BB#0: +; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movntdq %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movntdqa: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; 
HASWELL-LABEL: test_movntdqa: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntdqa: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <2 x i64> %a0, %a0 + store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0 + ret void +} + +define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_movntpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: addpd %xmm0, %xmm0 +; GENERIC-NEXT: movntpd %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movntpd: +; ATOM: # BB#0: +; ATOM-NEXT: addpd %xmm0, %xmm0 +; ATOM-NEXT: movntpd %xmm0, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movntpd: +; SLM: # BB#0: +; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movntpd %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movntpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd <2 x double> %a0, %a0 + store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0 + ret void +} + +define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) { +; GENERIC-LABEL: test_movq_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; GENERIC-NEXT: 
paddq %xmm1, %xmm0 +; GENERIC-NEXT: movq %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movq_mem: +; ATOM: # BB#0: +; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: movq %xmm0, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movq_mem: +; SLM: # BB#0: +; SLM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movq %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movq_mem: +; SANDY: # BB#0: +; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movq_mem: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movq_mem: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load i64, i64* %a1, align 1 + %2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0 + %3 = add <2 x i64> %a0, %2 + %4 = extractelement <2 x i64> %3, i32 0 + store i64 %4, i64 *%a1, align 1 + ret <2 x i64> %3 +} + +define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) { +; GENERIC-LABEL: test_movq_reg: +; GENERIC: # BB#0: +; GENERIC-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movq_reg: +; ATOM: # BB#0: +; ATOM-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movq_reg: 
+; SLM: # BB#0: +; SLM-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movq_reg: +; SANDY: # BB#0: +; SANDY-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33] +; SANDY-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movq_reg: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33] +; HASWELL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movq_reg: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> + %2 = add <2 x i64> %a1, %1 + ret <2 x i64> %2 +} + +define void @test_movsd_mem(double* %a0, double* %a1) { +; GENERIC-LABEL: test_movsd_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; GENERIC-NEXT: addsd %xmm0, %xmm0 +; GENERIC-NEXT: movsd %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movsd_mem: +; ATOM: # BB#0: +; ATOM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; ATOM-NEXT: addsd %xmm0, %xmm0 +; ATOM-NEXT: movsd %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movsd_mem: +; SLM: # BB#0: +; SLM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: addsd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movsd %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movsd_mem: +; SANDY: # BB#0: +; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movsd_mem: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovsd
{{.*#+}} xmm0 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movsd_mem: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:1.00] +; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load double, double* %a0, align 1 + %2 = fadd double %1, %1 + store double %2, double *%a1, align 1 + ret void +} + +define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) { +; GENERIC-LABEL: test_movsd_reg: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; GENERIC-NEXT: movapd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movsd_reg: +; ATOM: # BB#0: +; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; ATOM-NEXT: movapd %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movsd_reg: +; SLM: # BB#0: +; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movsd_reg: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movsd_reg: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movsd_reg: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 0> + ret <2 x double> %1 +} + +define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_movupd: +; GENERIC: # BB#0:
+; GENERIC-NEXT: movupd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm0, %xmm0 +; GENERIC-NEXT: movupd %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movupd: +; ATOM: # BB#0: +; ATOM-NEXT: movupd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm0, %xmm0 +; ATOM-NEXT: movupd %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movupd: +; SLM: # BB#0: +; SLM-NEXT: movupd (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movupd %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movupd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movupd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movupd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovupd (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x double>, <2 x double> *%a0, align 1 + %2 = fadd <2 x double> %1, %1 + store <2 x double> %2, <2 x double> *%a1, align 1 + ret void +} + +define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_mulpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: mulpd %xmm1, %xmm0 +; GENERIC-NEXT: mulpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mulpd: +; ATOM: # BB#0: +; ATOM-NEXT: mulpd %xmm1, %xmm0 +; ATOM-NEXT: mulpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mulpd: +; SLM: # BB#0: +; SLM-NEXT: mulpd %xmm1, %xmm0 # sched: [5:2.00] +; SLM-NEXT: mulpd (%rdi), %xmm0 # sched: [8:2.00] 
+; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mulpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fmul <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fmul <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_mulsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_mulsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: mulsd %xmm1, %xmm0 +; GENERIC-NEXT: mulsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mulsd: +; ATOM: # BB#0: +; ATOM-NEXT: mulsd %xmm1, %xmm0 +; ATOM-NEXT: mulsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mulsd: +; SLM: # BB#0: +; SLM-NEXT: mulsd %xmm1, %xmm0 # sched: [5:2.00] +; SLM-NEXT: mulsd (%rdi), %xmm0 # sched: [8:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mulsd: +; SANDY: # BB#0: +; SANDY-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fmul 
double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fmul double %1, %2 + ret double %3 +} + +define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_orpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: orpd %xmm1, %xmm0 +; GENERIC-NEXT: orpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_orpd: +; ATOM: # BB#0: +; ATOM-NEXT: orpd %xmm1, %xmm0 +; ATOM-NEXT: orpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_orpd: +; SLM: # BB#0: +; SLM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: orpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_orpd: +; SANDY: # BB#0: +; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_orpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_orpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> %a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = or <4 x i32> %1, %2 + %4 = load <2 x double>, <2 x double> *%a2, align 16 + %5 = bitcast <2 x double> %4 to <4 x i32> + %6 = or <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <2 x double> + %8 = fadd <2 x double> %a1, %7 + ret <2 x double> %8 +} + +define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> 
*%a2) { +; GENERIC-LABEL: test_packssdw: +; GENERIC: # BB#0: +; GENERIC-NEXT: packssdw %xmm1, %xmm0 +; GENERIC-NEXT: packssdw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_packssdw: +; ATOM: # BB#0: +; ATOM-NEXT: packssdw %xmm1, %xmm0 +; ATOM-NEXT: packssdw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_packssdw: +; SLM: # BB#0: +; SLM-NEXT: packssdw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: packssdw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_packssdw: +; SANDY: # BB#0: +; SANDY-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_packssdw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_packssdw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = load <4 x i32>, <4 x i32> *%a2, align 16 + %4 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %2, <4 x i32> %3) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone + +define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_packsswb: +; GENERIC: # BB#0: +; GENERIC-NEXT: packsswb %xmm1, %xmm0 +; GENERIC-NEXT: packsswb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_packsswb: +; ATOM: # BB#0: +; ATOM-NEXT: packsswb %xmm1, %xmm0 +; ATOM-NEXT: 
packsswb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_packsswb: +; SLM: # BB#0: +; SLM-NEXT: packsswb %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: packsswb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_packsswb: +; SANDY: # BB#0: +; SANDY-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_packsswb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_packsswb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = load <8 x i16>, <8 x i16> *%a2, align 16 + %4 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %2, <8 x i16> %3) + ret <16 x i8> %4 +} +declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_packuswb: +; GENERIC: # BB#0: +; GENERIC-NEXT: packuswb %xmm1, %xmm0 +; GENERIC-NEXT: packuswb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_packuswb: +; ATOM: # BB#0: +; ATOM-NEXT: packuswb %xmm1, %xmm0 +; ATOM-NEXT: packuswb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_packuswb: +; SLM: # BB#0: +; SLM-NEXT: packuswb %xmm1, 
%xmm0 # sched: [1:1.00] +; SLM-NEXT: packuswb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_packuswb: +; SANDY: # BB#0: +; SANDY-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_packuswb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_packuswb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = load <8 x i16>, <8 x i16> *%a2, align 16 + %4 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %2, <8 x i16> %3) + ret <16 x i8> %4 +} +declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_paddb: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddb %xmm1, %xmm0 +; GENERIC-NEXT: paddb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddb: +; ATOM: # BB#0: +; ATOM-NEXT: paddb %xmm1, %xmm0 +; ATOM-NEXT: paddb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddb: +; SLM: # BB#0: +; SLM-NEXT: paddb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddb: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddb: +; 
HASWELL: # BB#0: +; HASWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = add <16 x i8> %1, %2 + ret <16 x i8> %3 +} + +define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_paddd: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: paddd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddd: +; ATOM: # BB#0: +; ATOM-NEXT: paddd %xmm1, %xmm0 +; ATOM-NEXT: paddd (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddd: +; SLM: # BB#0: +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddd: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = add <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: 
test_paddq: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: paddq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddq: +; ATOM: # BB#0: +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: paddq (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddq: +; SLM: # BB#0: +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddq: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = add <2 x i64> %1, %2 + ret <2 x i64> %3 +} + +define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_paddsb: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddsb %xmm1, %xmm0 +; GENERIC-NEXT: paddsb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddsb: +; ATOM: # BB#0: +; ATOM-NEXT: paddsb %xmm1, %xmm0 +; ATOM-NEXT: paddsb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddsb: +; SLM: # BB#0: +; SLM-NEXT: paddsb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddsb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddsb: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: 
[5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddsb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddsb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_paddsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddsw %xmm1, %xmm0 +; GENERIC-NEXT: paddsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddsw: +; ATOM: # BB#0: +; ATOM-NEXT: paddsw %xmm1, %xmm0 +; ATOM-NEXT: paddsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddsw: +; SLM: # BB#0: +; SLM-NEXT: paddsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # 
sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_paddusb: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddusb %xmm1, %xmm0 +; GENERIC-NEXT: paddusb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddusb: +; ATOM: # BB#0: +; ATOM-NEXT: paddusb %xmm1, %xmm0 +; ATOM-NEXT: paddusb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddusb: +; SLM: # BB#0: +; SLM-NEXT: paddusb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddusb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddusb: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddusb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddusb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 
x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_paddusw: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddusw %xmm1, %xmm0 +; GENERIC-NEXT: paddusw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddusw: +; ATOM: # BB#0: +; ATOM-NEXT: paddusw %xmm1, %xmm0 +; ATOM-NEXT: paddusw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddusw: +; SLM: # BB#0: +; SLM-NEXT: paddusw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddusw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddusw: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddusw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddusw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_paddw: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: paddw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddw: +; ATOM: # BB#0: +; ATOM-NEXT: paddw %xmm1, %xmm0 +; ATOM-NEXT: paddw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddw: 
+; SLM: # BB#0: +; SLM-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddw: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = add <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pand: +; GENERIC: # BB#0: +; GENERIC-NEXT: pand %xmm1, %xmm0 +; GENERIC-NEXT: pand (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pand: +; ATOM: # BB#0: +; ATOM-NEXT: pand %xmm1, %xmm0 +; ATOM-NEXT: pand (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pand: +; SLM: # BB#0: +; SLM-NEXT: pand %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pand (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pand: +; SANDY: # BB#0: +; SANDY-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pand: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: 
[5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pand: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = and <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = and <2 x i64> %1, %2 + %4 = add <2 x i64> %3, %a1 + ret <2 x i64> %4 +} + +define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pandn: +; GENERIC: # BB#0: +; GENERIC-NEXT: pandn %xmm1, %xmm0 +; GENERIC-NEXT: movdqa %xmm0, %xmm1 +; GENERIC-NEXT: pandn (%rdi), %xmm1 +; GENERIC-NEXT: paddq %xmm0, %xmm1 +; GENERIC-NEXT: movdqa %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pandn: +; ATOM: # BB#0: +; ATOM-NEXT: pandn %xmm1, %xmm0 +; ATOM-NEXT: movdqa %xmm0, %xmm1 +; ATOM-NEXT: pandn (%rdi), %xmm1 +; ATOM-NEXT: paddq %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pandn: +; SLM: # BB#0: +; SLM-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pandn (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pandn: +; SANDY: # BB#0: +; SANDY-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pandn: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; 
BTVER2-LABEL: test_pandn: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = xor <2 x i64> %a0, <i64 -1, i64 -1> + %2 = and <2 x i64> %a1, %1 + %3 = load <2 x i64>, <2 x i64> *%a2, align 16 + %4 = xor <2 x i64> %2, <i64 -1, i64 -1> + %5 = and <2 x i64> %3, %4 + %6 = add <2 x i64> %2, %5 + ret <2 x i64> %6 +} + +define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pavgb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pavgb %xmm1, %xmm0 +; GENERIC-NEXT: pavgb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pavgb: +; ATOM: # BB#0: +; ATOM-NEXT: pavgb %xmm1, %xmm0 +; ATOM-NEXT: pavgb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pavgb: +; SLM: # BB#0: +; SLM-NEXT: pavgb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pavgb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pavgb: +; SANDY: # BB#0: +; SANDY-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pavgb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pavgb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0,
<16 x i8> %arg1) nounwind readnone + +define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pavgw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pavgw %xmm1, %xmm0 +; GENERIC-NEXT: pavgw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pavgw: +; ATOM: # BB#0: +; ATOM-NEXT: pavgw %xmm1, %xmm0 +; ATOM-NEXT: pavgw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pavgw: +; SLM: # BB#0: +; SLM-NEXT: pavgw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pavgw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pavgw: +; SANDY: # BB#0: +; SANDY-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pavgw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pavgw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pcmpeqb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpeqb %xmm0, %xmm1 +; GENERIC-NEXT: pcmpeqb (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpeqb: +; ATOM: # BB#0: +; ATOM-NEXT: pcmpeqb %xmm0, %xmm1 +; ATOM-NEXT: pcmpeqb (%rdi), %xmm0 +; ATOM-NEXT: por %xmm1, 
%xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpeqb: +; SLM: # BB#0: +; SLM-NEXT: pcmpeqb %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pcmpeqb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpeqb: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpeqb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpeqb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp eq <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = icmp eq <16 x i8> %a0, %2 + %4 = or <16 x i1> %1, %3 + %5 = sext <16 x i1> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pcmpeqd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpeqd %xmm0, %xmm1 +; GENERIC-NEXT: pcmpeqd (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpeqd: +; ATOM: # BB#0: +; ATOM-NEXT: pcmpeqd %xmm0, %xmm1 +; ATOM-NEXT: pcmpeqd (%rdi), %xmm0 +; ATOM-NEXT: por %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpeqd: +; SLM: # BB#0: +; SLM-NEXT: pcmpeqd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: 
[1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpeqd: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpeqd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpeqd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp eq <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = icmp eq <4 x i32> %a0, %2 + %4 = or <4 x i1> %1, %3 + %5 = sext <4 x i1> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pcmpeqw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpeqw %xmm0, %xmm1 +; GENERIC-NEXT: pcmpeqw (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpeqw: +; ATOM: # BB#0: +; ATOM-NEXT: pcmpeqw %xmm0, %xmm1 +; ATOM-NEXT: pcmpeqw (%rdi), %xmm0 +; ATOM-NEXT: por %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpeqw: +; SLM: # BB#0: +; SLM-NEXT: pcmpeqw %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pcmpeqw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpeqw: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, 
%xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpeqw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpeqw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp eq <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = icmp eq <8 x i16> %a0, %2 + %4 = or <8 x i1> %1, %3 + %5 = sext <8 x i1> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pcmpgtb: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa %xmm0, %xmm2 +; GENERIC-NEXT: pcmpgtb %xmm1, %xmm2 +; GENERIC-NEXT: pcmpgtb (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpgtb: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa %xmm0, %xmm2 +; ATOM-NEXT: pcmpgtb (%rdi), %xmm0 +; ATOM-NEXT: pcmpgtb %xmm1, %xmm2 +; ATOM-NEXT: por %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpgtb: +; SLM: # BB#0: +; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: pcmpgtb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pcmpgtb %xmm1, %xmm2 # sched: [1:0.50] +; SLM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpgtb: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpgtb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # 
sched: [1:0.50] +; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpgtb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp sgt <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = icmp sgt <16 x i8> %a0, %2 + %4 = or <16 x i1> %1, %3 + %5 = sext <16 x i1> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pcmpgtd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa %xmm0, %xmm2 +; GENERIC-NEXT: pcmpgtd %xmm1, %xmm2 +; GENERIC-NEXT: pcmpeqd (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpgtd: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa %xmm0, %xmm2 +; ATOM-NEXT: pcmpeqd (%rdi), %xmm0 +; ATOM-NEXT: pcmpgtd %xmm1, %xmm2 +; ATOM-NEXT: por %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpgtd: +; SLM: # BB#0: +; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pcmpgtd %xmm1, %xmm2 # sched: [1:0.50] +; SLM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpgtd: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpgtd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # 
sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpgtd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp sgt <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = icmp eq <4 x i32> %a0, %2 + %4 = or <4 x i1> %1, %3 + %5 = sext <4 x i1> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pcmpgtw: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa %xmm0, %xmm2 +; GENERIC-NEXT: pcmpgtw %xmm1, %xmm2 +; GENERIC-NEXT: pcmpgtw (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpgtw: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa %xmm0, %xmm2 +; ATOM-NEXT: pcmpgtw (%rdi), %xmm0 +; ATOM-NEXT: pcmpgtw %xmm1, %xmm2 +; ATOM-NEXT: por %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpgtw: +; SLM: # BB#0: +; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: pcmpgtw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pcmpgtw %xmm1, %xmm2 # sched: [1:0.50] +; SLM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpgtw: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpgtw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpgtw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpgtw (%rdi), 
%xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp sgt <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = icmp sgt <8 x i16> %a0, %2 + %4 = or <8 x i1> %1, %3 + %5 = sext <8 x i1> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define i16 @test_pextrw(<8 x i16> %a0) { +; GENERIC-LABEL: test_pextrw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextrw $6, %xmm0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pextrw: +; ATOM: # BB#0: +; ATOM-NEXT: pextrw $6, %xmm0, %eax +; ATOM-NEXT: # kill: %AX %AX %EAX +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pextrw: +; SLM: # BB#0: +; SLM-NEXT: pextrw $6, %xmm0, %eax # sched: [4:1.00] +; SLM-NEXT: # kill: %AX %AX %EAX +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pextrw: +; SANDY: # BB#0: +; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: # kill: %AX %AX %EAX +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pextrw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pextrw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = extractelement <8 x i16> %a0, i32 6 + ret i16 %1 +} + +define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmaddwd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaddwd %xmm1, %xmm0 +; GENERIC-NEXT: pmaddwd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmaddwd: +; ATOM: # BB#0: +; ATOM-NEXT: pmaddwd %xmm1, %xmm0 +; ATOM-NEXT: pmaddwd (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmaddwd: 
+; SLM: # BB#0: +; SLM-NEXT: pmaddwd %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmaddwd (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaddwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaddwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaddwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) + %2 = bitcast <4 x i32> %1 to <8 x i16> + %3 = load <8 x i16>, <8 x i16> *%a2, align 16 + %4 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %2, <8 x i16> %3) + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmaxsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxsw %xmm1, %xmm0 +; GENERIC-NEXT: pmaxsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmaxsw: +; ATOM: # BB#0: +; ATOM-NEXT: pmaxsw %xmm1, %xmm0 +; ATOM-NEXT: pmaxsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmaxsw: +; SLM: # BB#0: +; SLM-NEXT: pmaxsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; 
HASWELL-LABEL: test_pmaxsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pmaxub: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxub %xmm1, %xmm0 +; GENERIC-NEXT: pmaxub (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmaxub: +; ATOM: # BB#0: +; ATOM-NEXT: pmaxub %xmm1, %xmm0 +; ATOM-NEXT: pmaxub (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmaxub: +; SLM: # BB#0: +; SLM-NEXT: pmaxub %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxub (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxub: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaxub: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxub: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: 
[4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pminsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pminsw %xmm1, %xmm0 +; GENERIC-NEXT: pminsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pminsw: +; ATOM: # BB#0: +; ATOM-NEXT: pminsw %xmm1, %xmm0 +; ATOM-NEXT: pminsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pminsw: +; SLM: # BB#0: +; SLM-NEXT: pminsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pminub: +; 
GENERIC: # BB#0: +; GENERIC-NEXT: pminub %xmm1, %xmm0 +; GENERIC-NEXT: pminub (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pminub: +; ATOM: # BB#0: +; ATOM-NEXT: pminub %xmm1, %xmm0 +; ATOM-NEXT: pminub (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pminub: +; SLM: # BB#0: +; SLM-NEXT: pminub %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminub (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminub: +; SANDY: # BB#0: +; SANDY-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminub: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminub: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone + +define i32 @test_pmovmskb(<16 x i8> %a0) { +; GENERIC-LABEL: test_pmovmskb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovmskb %xmm0, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmovmskb: +; ATOM: # BB#0: +; ATOM-NEXT: pmovmskb %xmm0, %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmovmskb: +; SLM: # BB#0: +; SLM-NEXT: pmovmskb %xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovmskb: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.33] +; 
SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovmskb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovmskb %xmm0, %eax # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovmskb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmulhuw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmulhuw %xmm1, %xmm0 +; GENERIC-NEXT: pmulhuw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmulhuw: +; ATOM: # BB#0: +; ATOM-NEXT: pmulhuw %xmm1, %xmm0 +; ATOM-NEXT: pmulhuw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmulhuw: +; SLM: # BB#0: +; SLM-NEXT: pmulhuw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmulhuw (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmulhuw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmulhuw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmulhuw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind 
readnone + +define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmulhw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmulhw %xmm1, %xmm0 +; GENERIC-NEXT: pmulhw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmulhw: +; ATOM: # BB#0: +; ATOM-NEXT: pmulhw %xmm1, %xmm0 +; ATOM-NEXT: pmulhw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmulhw: +; SLM: # BB#0: +; SLM-NEXT: pmulhw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmulhw (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmulhw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmulhw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmulhw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmullw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmullw %xmm1, %xmm0 +; GENERIC-NEXT: pmullw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmullw: +; ATOM: # BB#0: +; ATOM-NEXT: pmullw %xmm1, %xmm0 +; ATOM-NEXT: pmullw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmullw: +; SLM: # BB#0: +; SLM-NEXT: pmullw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmullw (%rdi), 
%xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmullw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmullw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmullw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = mul <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = mul <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pmuludq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmuludq %xmm1, %xmm0 +; GENERIC-NEXT: pmuludq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmuludq: +; ATOM: # BB#0: +; ATOM-NEXT: pmuludq %xmm1, %xmm0 +; ATOM-NEXT: pmuludq (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmuludq: +; SLM: # BB#0: +; SLM-NEXT: pmuludq %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmuludq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmuludq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmuludq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: 
test_pmuludq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = load <4 x i32>, <4 x i32> *%a2, align 16 + %4 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %2, <4 x i32> %3) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone + +define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_por: +; GENERIC: # BB#0: +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: por (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_por: +; ATOM: # BB#0: +; ATOM-NEXT: por %xmm1, %xmm0 +; ATOM-NEXT: por (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_por: +; SLM: # BB#0: +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: por (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_por: +; SANDY: # BB#0: +; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_por: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_por: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = or <2 
x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = or <2 x i64> %1, %2 + %4 = add <2 x i64> %3, %a1 + ret <2 x i64> %4 +} + +define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psadbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psadbw %xmm1, %xmm0 +; GENERIC-NEXT: psadbw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psadbw: +; ATOM: # BB#0: +; ATOM-NEXT: psadbw %xmm1, %xmm0 +; ATOM-NEXT: psadbw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psadbw: +; SLM: # BB#0: +; SLM-NEXT: psadbw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psadbw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psadbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psadbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psadbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) + %2 = bitcast <2 x i64> %1 to <16 x i8> + %3 = load <16 x i8>, <16 x i8> *%a2, align 16 + %4 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %2, <16 x i8> %3) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) { +; GENERIC-LABEL: test_pshufd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; 
GENERIC-NEXT: pshufd {{.*#+}} xmm0 = mem[3,2,1,0] +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pshufd: +; ATOM: # BB#0: +; ATOM-NEXT: pshufd {{.*#+}} xmm1 = mem[3,2,1,0] +; ATOM-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pshufd: +; SLM: # BB#0: +; SLM-NEXT: pshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [4:1.00] +; SLM-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00] +; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pshufd: +; SANDY: # BB#0: +; SANDY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] +; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pshufd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00] +; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pshufd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> + %2 = load <4 x i32>, <4 x i32> *%a1, align 16 + %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) { +; GENERIC-LABEL: test_pshufhw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] +; GENERIC-NEXT: pshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,6,5,4] +; 
GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pshufhw: +; ATOM: # BB#0: +; ATOM-NEXT: pshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] +; ATOM-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; ATOM-NEXT: paddw %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pshufhw: +; SLM: # BB#0: +; SLM-NEXT: pshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [4:1.00] +; SLM-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00] +; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pshufhw: +; SANDY: # BB#0: +; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] +; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pshufhw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00] +; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00] +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pshufhw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [6:1.00] +; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + %2 = load <8 x i16>, <8 x i16> *%a1, align 16 + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> + %4 = add <8 x i16> %1, %3 + ret <8 x i16> %4 +} + +define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) { +; GENERIC-LABEL: test_pshuflw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] +; GENERIC-NEXT: 
pshuflw {{.*#+}} xmm0 = mem[3,2,1,0,4,5,6,7] +; GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pshuflw: +; ATOM: # BB#0: +; ATOM-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] +; ATOM-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; ATOM-NEXT: paddw %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pshuflw: +; SLM: # BB#0: +; SLM-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [4:1.00] +; SLM-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00] +; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pshuflw: +; SANDY: # BB#0: +; SANDY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] +; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pshuflw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00] +; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00] +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pshuflw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [6:1.00] +; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + %2 = load <8 x i16>, <8 x i16> *%a1, align 16 + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> + %4 = add <8 x i16> %1, %3 + ret <8 x i16> %4 +} + +define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pslld: +; GENERIC: # BB#0: +; GENERIC-NEXT: pslld 
%xmm1, %xmm0 +; GENERIC-NEXT: pslld (%rdi), %xmm0 +; GENERIC-NEXT: pslld $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pslld: +; ATOM: # BB#0: +; ATOM-NEXT: pslld %xmm1, %xmm0 +; ATOM-NEXT: pslld (%rdi), %xmm0 +; ATOM-NEXT: pslld $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pslld: +; SLM: # BB#0: +; SLM-NEXT: pslld %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: pslld (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pslld $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pslld: +; SANDY: # BB#0: +; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pslld: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pslld: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2) + %4 = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone + +define <4 x i32> @test_pslldq(<4 x i32> %a0) { +; GENERIC-LABEL: test_pslldq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pslldq: +; 
ATOM: # BB#0: +; ATOM-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pslldq: +; SLM: # BB#0: +; SLM-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pslldq: +; SANDY: # BB#0: +; SANDY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pslldq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pslldq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %1 +} + +define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_psllq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psllq %xmm1, %xmm0 +; GENERIC-NEXT: psllq (%rdi), %xmm0 +; GENERIC-NEXT: psllq $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psllq: +; ATOM: # BB#0: +; ATOM-NEXT: psllq %xmm1, %xmm0 +; ATOM-NEXT: psllq (%rdi), %xmm0 +; ATOM-NEXT: psllq $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psllq: +; SLM: # BB#0: +; SLM-NEXT: psllq %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psllq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psllq $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psllq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: 
retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psllq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psllq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2) + %4 = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %3, i32 2) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone +declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone + +define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psllw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psllw %xmm1, %xmm0 +; GENERIC-NEXT: psllw (%rdi), %xmm0 +; GENERIC-NEXT: psllw $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psllw: +; ATOM: # BB#0: +; ATOM-NEXT: psllw %xmm1, %xmm0 +; ATOM-NEXT: psllw (%rdi), %xmm0 +; ATOM-NEXT: psllw $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psllw: +; SLM: # BB#0: +; SLM-NEXT: psllw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psllw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psllw $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psllw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psllw: +; HASWELL: # BB#0: +; HASWELL-NEXT: 
vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psllw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2) + %4 = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone + +define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_psrad: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrad %xmm1, %xmm0 +; GENERIC-NEXT: psrad (%rdi), %xmm0 +; GENERIC-NEXT: psrad $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrad: +; ATOM: # BB#0: +; ATOM-NEXT: psrad %xmm1, %xmm0 +; ATOM-NEXT: psrad (%rdi), %xmm0 +; ATOM-NEXT: psrad $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrad: +; SLM: # BB#0: +; SLM-NEXT: psrad %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrad (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psrad $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrad: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrad: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # 
sched: [5:1.00] +; HASWELL-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrad: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2) + %4 = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone + +define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psraw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psraw %xmm1, %xmm0 +; GENERIC-NEXT: psraw (%rdi), %xmm0 +; GENERIC-NEXT: psraw $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psraw: +; ATOM: # BB#0: +; ATOM-NEXT: psraw %xmm1, %xmm0 +; ATOM-NEXT: psraw (%rdi), %xmm0 +; ATOM-NEXT: psraw $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psraw: +; SLM: # BB#0: +; SLM-NEXT: psraw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psraw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psraw $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psraw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psraw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq 
# sched: [1:1.00] +; +; BTVER2-LABEL: test_psraw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2) + %4 = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone + +define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_psrld: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrld %xmm1, %xmm0 +; GENERIC-NEXT: psrld (%rdi), %xmm0 +; GENERIC-NEXT: psrld $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrld: +; ATOM: # BB#0: +; ATOM-NEXT: psrld %xmm1, %xmm0 +; ATOM-NEXT: psrld (%rdi), %xmm0 +; ATOM-NEXT: psrld $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrld: +; SLM: # BB#0: +; SLM-NEXT: psrld %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrld (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psrld $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrld: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrld: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrld: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrld 
%xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2) + %4 = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone + +define <4 x i32> @test_psrldq(<4 x i32> %a0) { +; GENERIC-LABEL: test_psrldq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrldq: +; ATOM: # BB#0: +; ATOM-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrldq: +; SLM: # BB#0: +; SLM-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrldq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrldq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrldq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> + 
ret <4 x i32> %1 +} + +define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_psrlq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrlq %xmm1, %xmm0 +; GENERIC-NEXT: psrlq (%rdi), %xmm0 +; GENERIC-NEXT: psrlq $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrlq: +; ATOM: # BB#0: +; ATOM-NEXT: psrlq %xmm1, %xmm0 +; ATOM-NEXT: psrlq (%rdi), %xmm0 +; ATOM-NEXT: psrlq $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrlq: +; SLM: # BB#0: +; SLM-NEXT: psrlq %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrlq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psrlq $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrlq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrlq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrlq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2) + %4 = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %3, i32 2) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone +declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone + +define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) 
{ +; GENERIC-LABEL: test_psrlw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrlw %xmm1, %xmm0 +; GENERIC-NEXT: psrlw (%rdi), %xmm0 +; GENERIC-NEXT: psrlw $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrlw: +; ATOM: # BB#0: +; ATOM-NEXT: psrlw %xmm1, %xmm0 +; ATOM-NEXT: psrlw (%rdi), %xmm0 +; ATOM-NEXT: psrlw $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrlw: +; SLM: # BB#0: +; SLM-NEXT: psrlw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrlw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psrlw $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrlw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrlw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrlw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2) + %4 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone + +define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psubb: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubb %xmm1, %xmm0 +; 
GENERIC-NEXT: psubb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubb: +; ATOM: # BB#0: +; ATOM-NEXT: psubb %xmm1, %xmm0 +; ATOM-NEXT: psubb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubb: +; SLM: # BB#0: +; SLM-NEXT: psubb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubb: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = sub <16 x i8> %1, %2 + ret <16 x i8> %3 +} + +define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_psubd: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubd %xmm1, %xmm0 +; GENERIC-NEXT: psubd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubd: +; ATOM: # BB#0: +; ATOM-NEXT: psubd %xmm1, %xmm0 +; ATOM-NEXT: psubd (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubd: +; SLM: # BB#0: +; SLM-NEXT: psubd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubd: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; 
SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = sub <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_psubq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubq %xmm1, %xmm0 +; GENERIC-NEXT: psubq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubq: +; ATOM: # BB#0: +; ATOM-NEXT: psubq %xmm1, %xmm0 +; ATOM-NEXT: psubq (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubq: +; SLM: # BB#0: +; SLM-NEXT: psubq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = sub <2 x i64> %1, %2 + ret <2 x i64> %3 +} + +define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: 
test_psubsb: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubsb %xmm1, %xmm0 +; GENERIC-NEXT: psubsb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubsb: +; ATOM: # BB#0: +; ATOM-NEXT: psubsb %xmm1, %xmm0 +; ATOM-NEXT: psubsb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubsb: +; SLM: # BB#0: +; SLM-NEXT: psubsb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubsb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubsb: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubsb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubsb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psubsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubsw %xmm1, %xmm0 +; GENERIC-NEXT: psubsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubsw: +; ATOM: # BB#0: +; ATOM-NEXT: psubsw %xmm1, %xmm0 +; ATOM-NEXT: psubsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubsw: +; SLM: # BB#0: +; SLM-NEXT: psubsw %xmm1, %xmm0 # sched: 
[1:0.50] +; SLM-NEXT: psubsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psubusb: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubusb %xmm1, %xmm0 +; GENERIC-NEXT: psubusb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubusb: +; ATOM: # BB#0: +; ATOM-NEXT: psubusb %xmm1, %xmm0 +; ATOM-NEXT: psubusb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubusb: +; SLM: # BB#0: +; SLM-NEXT: psubusb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubusb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubusb: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubusb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # 
sched: [1:0.50] +; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubusb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psubusw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubusw %xmm1, %xmm0 +; GENERIC-NEXT: psubusw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubusw: +; ATOM: # BB#0: +; ATOM-NEXT: psubusw %xmm1, %xmm0 +; ATOM-NEXT: psubusw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubusw: +; SLM: # BB#0: +; SLM-NEXT: psubusw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubusw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubusw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubusw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubusw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x 
i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psubw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubw %xmm1, %xmm0 +; GENERIC-NEXT: psubw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubw: +; ATOM: # BB#0: +; ATOM-NEXT: psubw %xmm1, %xmm0 +; ATOM-NEXT: psubw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubw: +; SLM: # BB#0: +; SLM-NEXT: psubw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = sub <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_punpckhbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhbw: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhbw: +; SLM: # BB#0: +; SLM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00] +; SLM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] +; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckhbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> + ret <16 x i8> %3 +} + +define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_punpckhdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhdq: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; ATOM-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; ATOM-NEXT: paddd %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhdq: +; SLM: # BB#0: +; SLM-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SLM-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [4:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = 
xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckhdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_punpckhqdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhqdq: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; ATOM-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhqdq: +; SLM: # BB#0: +; SLM-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SLM-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhqdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; SANDY-NEXT: 
vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhqdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckhqdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> + %4 = add <2 x i64> %1, %3 + ret <2 x i64> %4 +} + +define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_punpckhwd: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhwd: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhwd: +; SLM: # BB#0: +; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [4:1.00] +; 
SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckhwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ret <8 x i16> %3 +} + +define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_punpcklbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpcklbw: +; ATOM: # BB#0: +; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpcklbw: +; SLM: # BB#0: +; SLM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; SLM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpcklbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpcklbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpcklbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] +; 
BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> + ret <16 x i8> %3 +} + +define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_punpckldq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckldq: +; ATOM: # BB#0: +; ATOM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; ATOM-NEXT: paddd %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckldq: +; SLM: # BB#0: +; SLM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SLM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [4:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckldq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckldq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckldq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: 
vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_punpcklqdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpcklqdq: +; ATOM: # BB#0: +; ATOM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpcklqdq: +; SLM: # BB#0: +; SLM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SLM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpcklqdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpcklqdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpcklqdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = 
xmm1[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> + %4 = add <2 x i64> %1, %3 + ret <2 x i64> %4 +} + +define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_punpcklwd: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpcklwd: +; ATOM: # BB#0: +; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpcklwd: +; SLM: # BB#0: +; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpcklwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpcklwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 
sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpcklwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ret <8 x i16> %3 +} + +define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pxor: +; GENERIC: # BB#0: +; GENERIC-NEXT: pxor %xmm1, %xmm0 +; GENERIC-NEXT: pxor (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pxor: +; ATOM: # BB#0: +; ATOM-NEXT: pxor %xmm1, %xmm0 +; ATOM-NEXT: pxor (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pxor: +; SLM: # BB#0: +; SLM-NEXT: pxor %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pxor (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pxor: +; SANDY: # BB#0: +; SANDY-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pxor: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pxor: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: 
[1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = xor <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = xor <2 x i64> %1, %2 + %4 = add <2 x i64> %3, %a1 + ret <2 x i64> %4 +} + +define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_shufpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; GENERIC-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_shufpd: +; ATOM: # BB#0: +; ATOM-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; ATOM-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_shufpd: +; SLM: # BB#0: +; SLM-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] +; SLM-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_shufpd: +; SANDY: # BB#0: +; SANDY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] +; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_shufpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_shufpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> + %2 = load <2 x double>, <2 x double> *%a2, align 16 + 
%3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_sqrtpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: sqrtpd %xmm0, %xmm1 +; GENERIC-NEXT: sqrtpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_sqrtpd: +; ATOM: # BB#0: +; ATOM-NEXT: sqrtpd %xmm0, %xmm1 +; ATOM-NEXT: sqrtpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_sqrtpd: +; SLM: # BB#0: +; SLM-NEXT: sqrtpd (%rdi), %xmm1 # sched: [18:1.00] +; SLM-NEXT: sqrtpd %xmm0, %xmm0 # sched: [15:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_sqrtpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00] +; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00] +; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2) + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +; TODO - sqrtsd_m + +define <2 x double> 
@test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_sqrtsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: sqrtsd %xmm0, %xmm0 +; GENERIC-NEXT: movapd (%rdi), %xmm1 +; GENERIC-NEXT: sqrtsd %xmm1, %xmm1 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_sqrtsd: +; ATOM: # BB#0: +; ATOM-NEXT: movapd (%rdi), %xmm1 +; ATOM-NEXT: sqrtsd %xmm0, %xmm0 +; ATOM-NEXT: sqrtsd %xmm1, %xmm1 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_sqrtsd: +; SLM: # BB#0: +; SLM-NEXT: movapd (%rdi), %xmm1 # sched: [3:1.00] +; SLM-NEXT: sqrtsd %xmm0, %xmm0 # sched: [18:1.00] +; SLM-NEXT: sqrtsd %xmm1, %xmm1 # sched: [18:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_sqrtsd: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00] +; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50] +; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00] +; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50] +; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovapd (%rdi), %xmm1 # sched: [5:1.00] +; BTVER2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) + %4 = fadd <2 x double> %1, %3 + ret <2 x 
double> %4 +} +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_subpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: subpd %xmm1, %xmm0 +; GENERIC-NEXT: subpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_subpd: +; ATOM: # BB#0: +; ATOM-NEXT: subpd %xmm1, %xmm0 +; ATOM-NEXT: subpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_subpd: +; SLM: # BB#0: +; SLM-NEXT: subpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: subpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_subpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fsub <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fsub <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_subsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_subsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: subsd %xmm1, %xmm0 +; GENERIC-NEXT: subsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_subsd: +; ATOM: # BB#0: +; ATOM-NEXT: subsd %xmm1, %xmm0 +; ATOM-NEXT: subsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_subsd: +; SLM: # BB#0: +; SLM-NEXT: subsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: subsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_subsd: +; SANDY: # 
BB#0: +; SANDY-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fsub double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fsub double %1, %2 + ret double %3 +} + +define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_ucomisd: +; GENERIC: # BB#0: +; GENERIC-NEXT: ucomisd %xmm1, %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %cl +; GENERIC-NEXT: andb %al, %cl +; GENERIC-NEXT: ucomisd (%rdi), %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %dl +; GENERIC-NEXT: andb %al, %dl +; GENERIC-NEXT: orb %cl, %dl +; GENERIC-NEXT: movzbl %dl, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_ucomisd: +; ATOM: # BB#0: +; ATOM-NEXT: ucomisd %xmm1, %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %cl +; ATOM-NEXT: andb %al, %cl +; ATOM-NEXT: ucomisd (%rdi), %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %dl +; ATOM-NEXT: andb %al, %dl +; ATOM-NEXT: orb %cl, %dl +; ATOM-NEXT: movzbl %dl, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_ucomisd: +; SLM: # BB#0: +; SLM-NEXT: ucomisd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %cl # sched: [1:0.50] +; SLM-NEXT: andb %al, %cl # sched: [1:0.50] +; SLM-NEXT: ucomisd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %dl # sched: [1:0.50] +; SLM-NEXT: andb %al, %dl # sched: [1:0.50] +; SLM-NEXT: orb %cl, %dl # sched: [1:0.50] +; SLM-NEXT: movzbl %dl, 
%eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ucomisd: +; SANDY: # BB#0: +; SANDY-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] +; SANDY-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] +; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] +; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_ucomisd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %cl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] +; HASWELL-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %dl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] +; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] +; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ucomisd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %cl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] +; BTVER2-NEXT: vucomisd (%rdi), %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %dl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %dl # sched: [1:0.50] +; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] +; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 8 + %3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2) + %4 = or i32 
%1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_unpckhpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; GENERIC-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_unpckhpd: +; ATOM: # BB#0: +; ATOM-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; ATOM-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_unpckhpd: +; SLM: # BB#0: +; SLM-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SLM-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_unpckhpd: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpckhpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpckhpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x 
i32> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_unpcklpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; GENERIC-NEXT: movapd %xmm0, %xmm1 +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; GENERIC-NEXT: addpd %xmm0, %xmm1 +; GENERIC-NEXT: movapd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_unpcklpd: +; ATOM: # BB#0: +; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ATOM-NEXT: movapd %xmm0, %xmm1 +; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movapd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_unpcklpd: +; SLM: # BB#0: +; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00] +; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_unpcklpd: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpcklpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpcklpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # 
sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_xorpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: xorpd %xmm1, %xmm0 +; GENERIC-NEXT: xorpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_xorpd: +; ATOM: # BB#0: +; ATOM-NEXT: xorpd %xmm1, %xmm0 +; ATOM-NEXT: xorpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_xorpd: +; SLM: # BB#0: +; SLM-NEXT: xorpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: xorpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_xorpd: +; SANDY: # BB#0: +; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_xorpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_xorpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> %a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = xor <4 x i32> %1, %2 + %4 = load <2 x double>, <2 x double> *%a2, align 16 + %5 = bitcast <2 x double> %4 to <4 x i32> + %6 = xor <4 x i32> %3, %5 + 
%7 = bitcast <4 x i32> %6 to <2 x double> + %8 = fadd <2 x double> %a1, %7 + ret <2 x double> %8 +} + +!0 = !{i32 1} diff --git a/test/CodeGen/X86/tail-merge-after-mbp.ll b/test/CodeGen/X86/tail-merge-after-mbp.ll deleted file mode 100644 index dc5f3a12bd91..000000000000 --- a/test/CodeGen/X86/tail-merge-after-mbp.ll +++ /dev/null @@ -1,94 +0,0 @@ -; RUN: llc -mtriple=x86_64-linux -o - %s | FileCheck %s - -%0 = type { %1, %3* } -%1 = type { %2* } -%2 = type { %2*, i8* } -%3 = type { i32, i32 (i32, i32)* } - - -declare i32 @Up(...) -declare i32 @f(i32, i32) - -; check loop block_14 is not merged with block_21 -; check loop block_11 is not merged with block_18, block_25 -define i32 @foo(%0* nocapture readonly, i32, i1 %c, i8* %p1, %2** %p2) { -; CHECK-LABEL: foo: -; CHECK: # %block_11 -; CHECK-NEXT: movq (%r14), %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: je -; CHECK-NEXT:# %block_14 -; CHECK-NEXT: cmpq $0, 8(%rax) -; CHECK-NEXT: jne -; CHECK-NEXT:# %block_18 -; CHECK-NEXT: movq (%r14), %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: je -; CHECK-NEXT:# %block_21 -; CHECK-NEXT:# =>This Inner Loop Header -; CHECK-NEXT: cmpq $0, 8(%rax) -; CHECK-NEXT: jne -; CHECK-NEXT:# %block_25 -; CHECK-NEXT:# in Loop -; CHECK-NEXT: movq (%r14), %rax -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: jne - br i1 %c, label %block_34, label %block_3 - -block_3: ; preds = %2 - br i1 %c, label %block_7, label %block_4 - -block_4: ; preds = %block_3 - %a5 = tail call i32 @f(i32 undef, i32 undef) - %a6 = icmp eq i32 %a5, 0 - br i1 %a6, label %block_7, label %block_34 - -block_7: ; preds = %block_4, %block_3 - %a8 = icmp eq %2* null, null - br i1 %a8, label %block_34, label %block_9 - -block_9: ; preds = %block_7 - %a10 = icmp eq i8* %p1, null - br i1 %a10, label %block_11, label %block_32 - -block_11: ; preds = %block_9 - %a12 = load %2*, %2** %p2, align 8 - %a13 = icmp eq %2* %a12, null - br i1 %a13, label %block_34, label %block_14 - -block_14: ; preds = %block_11 - %a15 = 
getelementptr inbounds %2, %2* %a12, i64 0, i32 1 - %a16 = load i8*, i8** %a15, align 8 - %a17 = icmp eq i8* %a16, null - br i1 %a17, label %block_18, label %block_32 - -block_18: ; preds = %block_14 - %a19 = load %2*, %2** %p2, align 8 - %a20 = icmp eq %2* %a19, null - br i1 %a20, label %block_34, label %block_21 - -block_21: ; preds = %block_18 - %a22 = getelementptr inbounds %2, %2* %a19, i64 0, i32 1 - %a23 = load i8*, i8** %a22, align 8 - %a24 = icmp eq i8* %a23, null - br i1 %a24, label %block_25, label %block_32 - -block_25: ; preds = %block_28, %block_21 - %a26 = load %2*, %2** %p2, align 8 - %a27 = icmp eq %2* %a26, null - br i1 %a27, label %block_34, label %block_28 - -block_28: ; preds = %block_25 - %a29 = getelementptr inbounds %2, %2* %a26, i64 0, i32 1 - %a30 = load i8*, i8** %a29, align 8 - %a31 = icmp eq i8* %a30, null - br i1 %a31, label %block_25, label %block_32 - -block_32: ; preds = %block_28, %block_21, %block_14, %block_9 - %a33 = tail call i32 (...) @Up() - br label %block_34 - -block_34: ; preds = %block_32, %block_25, %block_18, %block_11, %block_7, %block_4, %2 - %a35 = phi i32 [ 0, %2 ], [ %a5, %block_4 ], [ 0, %block_7 ], [ 0, %block_11 ], [ 0, %block_32 ], [ 0, %block_18 ], [ 0, %block_25 ] - ret i32 %a35 -} diff --git a/test/CodeGen/X86/tail-merge-after-mbp.mir b/test/CodeGen/X86/tail-merge-after-mbp.mir new file mode 100644 index 000000000000..d1dc65336948 --- /dev/null +++ b/test/CodeGen/X86/tail-merge-after-mbp.mir @@ -0,0 +1,105 @@ +# RUN: llc -mtriple=x86_64-linux -run-pass=block-placement -o - %s | FileCheck %s + +--- +# check loop bb.7 is not merged with bb.10, bb.13 +# check loop bb.9 is not merged with bb.12 +# CHECK: bb.2: +# CHECK-NEXT: successors: %bb.9(0x30000000), %bb.3(0x50000000) +# CHECK: %rax = MOV64rm %r14, 1, _, 0, _ +# CHECK-NEXT: TEST64rr %rax, %rax +# CHECK-NEXT: JE_1 %bb.9 +# CHECK: bb.3: +# CHECK-NEXT: successors: %bb.4(0x30000000), %bb.8(0x50000000) +# CHECK: CMP64mi8 killed %rax, 1, _, 8, _, 0 +# CHECK-NEXT: 
JNE_1 %bb.8 +# CHECK: bb.4: +# CHECK-NEXT: successors: %bb.9(0x30000000), %bb.5(0x50000000) +# CHECK: %rax = MOV64rm %r14, 1, _, 0, _ +# CHECK-NEXT: TEST64rr %rax, %rax +# CHECK-NEXT: JE_1 %bb.9 +# CHECK: bb.5 +# CHECK-NEXT: successors: %bb.6(0x71555555), %bb.8(0x0eaaaaab) +# CHECK: CMP64mi8 killed %rax, 1, _, 8, _, 0 +# CHECK-NEXT: JNE_1 %bb.8 +# CHECK: bb.6: +# CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000) +# CHECK: %rax = MOV64rm %r14, 1, _, 0, _ +# CHECK-NEXT: TEST64rr %rax, %rax +# CHECK-NEXT: JNE_1 %bb.5 + +name: foo +body: | + bb.0: + successors: %bb.1(0x40000000), %bb.7(0x40000000) + + TEST8ri %dl, 1, implicit-def %eflags, implicit killed %edx + JE_1 %bb.7, implicit %eflags + + bb.1: + successors: %bb.16(0x80000000) + + %ebp = XOR32rr undef %ebp, undef %ebp, implicit-def dead %eflags + JMP_1 %bb.16 + + bb.7: + successors: %bb.8(0x30000000), %bb.9(0x50000000) + + %rax = MOV64rm %r14, 1, _, 0, _ :: (load 8) + TEST64rr %rax, %rax, implicit-def %eflags + JNE_1 %bb.9, implicit killed %eflags + + bb.8: + successors: %bb.16(0x80000000) + + %ebp = XOR32rr undef %ebp, undef %ebp, implicit-def dead %eflags + JMP_1 %bb.16 + + bb.9: + successors: %bb.10(0x30000000), %bb.15(0x50000000) + + CMP64mi8 killed %rax, 1, _, 8, _, 0, implicit-def %eflags :: (load 8) + JNE_1 %bb.15, implicit %eflags + + bb.10: + successors: %bb.11(0x30000000), %bb.12(0x50000000) + + %rax = MOV64rm %r14, 1, _, 0, _ :: (load 8) + TEST64rr %rax, %rax, implicit-def %eflags + JNE_1 %bb.12, implicit %eflags + + bb.11: + successors: %bb.16(0x80000000) + + %ebp = XOR32rr undef %ebp, undef %ebp, implicit-def dead %eflags + JMP_1 %bb.16 + + bb.12: + successors: %bb.13(0x71555555), %bb.15(0x0eaaaaab) + + CMP64mi8 killed %rax, 1, _, 8, _, 0, implicit-def %eflags :: (load 8), (load 8) + JNE_1 %bb.15, implicit %eflags + + bb.13: + successors: %bb.14(0x04000000), %bb.12(0x7c000000) + + %rax = MOV64rm %r14, 1, _, 0, _ :: (load 8) + TEST64rr %rax, %rax, implicit-def %eflags + JNE_1 %bb.12, 
implicit %eflags + + bb.14: + successors: %bb.16(0x80000000) + + %ebp = XOR32rr undef %ebp, undef %ebp, implicit-def dead %eflags + JMP_1 %bb.16 + + bb.15: + successors: %bb.16(0x80000000) + + %ebp = XOR32rr undef %ebp, undef %ebp, implicit-def dead %eflags + dead %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags, implicit-def %al + + bb.16: + + RETQ %eax + +... diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index 5eb1a55881e5..852c1f4d3d98 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -1534,31 +1534,20 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { ; SSE-LABEL: splatconstant_rotate_mask_v2i64: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psllq $15, %xmm1 ; SSE-NEXT: psrlq $49, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_rotate_mask_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vpsllq $15, %xmm0, %xmm1 ; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: splatconstant_rotate_mask_v2i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $15, %xmm0, %xmm1 ; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v2i64: diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll index 3306cd400c1d..14215e486bf9 100644 --- a/test/CodeGen/X86/vector-rotate-256.ll +++ b/test/CodeGen/X86/vector-rotate-256.ll @@ -1014,34 +1014,23 @@ define <32 x i8> 
@splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind { ; AVX1-LABEL: splatconstant_rotate_mask_v4i64: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllq $15, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsllq $15, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $49, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpsrlq $49, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatconstant_rotate_mask_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpsllq $15, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_rotate_mask_v4i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $15, %ymm0, %ymm1 ; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: diff --git a/test/CodeGen/X86/x86-16.ll b/test/CodeGen/X86/x86-16.ll index 775b2c447bbd..55b53a8047c5 100644 --- a/test/CodeGen/X86/x86-16.ll +++ b/test/CodeGen/X86/x86-16.ll @@ -12,9 +12,16 @@ define i32 @main() #0 { ; CHECK: .code16 ; CHECK-LABEL: main +define i64 @foo(i32 %index) #0 { + %asm = tail call i64 asm "rdmsr", "=A,{cx},~{dirflag},~{fpsr},~{flags}"(i32 %index) + ret i64 %asm +} + +; CHECK-LABEL: foo +; CHECK: rdmsr attributes #0 = { nounwind } !llvm.ident = !{!0} -!0 = !{!"clang version 3.9.0 (trunk 265439) (llvm/trunk 265567)"} \ No newline at end of 
file +!0 = !{!"clang version 3.9.0 (trunk 265439) (llvm/trunk 265567)"} -- cgit v1.3