Diffstat (limited to 'test/CodeGen/X86')
80 files changed, 8779 insertions, 1104 deletions
diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
index 7da85d3a9a1d0..fa71bffaf8c64 100644
--- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
+++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+cmov | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+cmov -x86-cmov-converter=false | FileCheck %s
 ;
 ; Test scheduling a multi-use compare. We should neither spill flags
 ; nor clone the compare.
diff --git a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll
index 8d387136da9c8..37f01845db799 100644
--- a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll
+++ b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
 ; CHECK-NOT: -{{[1-9][0-9]*}}(%rsp)

-define x86_64_win64cc x86_fp80 @a(i64 %x) nounwind readnone {
+define win64cc x86_fp80 @a(i64 %x) nounwind readnone {
 entry:
   %conv = sitofp i64 %x to x86_fp80 ; <x86_fp80> [#uses=1]
   ret x86_fp80 %conv
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index ba5de8eb5fcb7..e812cbe3270ad 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -83,10 +83,11 @@ define void @full_test() {
 ; X32-NEXT: cmpeqps %xmm2, %xmm1
 ; X32-NEXT: movaps %xmm1, %xmm0
 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4
-; X32-NEXT: extractps $1, %xmm4, {{[0-9]+}}(%esp)
 ; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT: addl $60, %esp
 ; X32-NEXT: retl
 ;
diff --git a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
index 9dff4e596caa3..72807922a22b1 100644
--- a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
+++ b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck --check-prefix=CHECK %s
+; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s

 declare x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0);

diff --git a/test/CodeGen/X86/alias-static-alloca.ll b/test/CodeGen/X86/alias-static-alloca.ll
new file mode 100644
index 0000000000000..f4ca7e39f4fcb
--- /dev/null
+++ b/test/CodeGen/X86/alias-static-alloca.ll
@@ -0,0 +1,37 @@
+; RUN: llc -o - -mtriple=x86_64-linux-gnu %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; We should be able to bypass the load values to their corresponding
+; stores here.
+ +; CHECK-LABEL: foo +; CHECK-DAG: movl %esi, -8(%rsp) +; CHECK-DAG: movl %ecx, -16(%rsp) +; CHECK-DAG: movl %edi, -4(%rsp) +; CHECK-DAG: movl %edx, -12(%rsp) +; CHECK: leal +; CHECK: addl +; CHECK: addl +; CHECK: retq + +define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %a0 = alloca i32 + %a1 = alloca i32 + %a2 = alloca i32 + %a3 = alloca i32 + store i32 %b, i32* %a1 + store i32 %d, i32* %a3 + store i32 %a, i32* %a0 + store i32 %c, i32* %a2 + %l0 = load i32, i32* %a0 + %l1 = load i32, i32* %a1 + %l2 = load i32, i32* %a2 + %l3 = load i32, i32* %a3 + %add0 = add nsw i32 %l0, %l1 + %add1 = add nsw i32 %add0, %l2 + %add2 = add nsw i32 %add1, %l3 + ret i32 %add2 +} diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll index d5d3fa6db5e83..1a6fde371f09c 100644 --- a/test/CodeGen/X86/atomic-minmax-i6432.ll +++ b/test/CodeGen/X86/atomic-minmax-i6432.ll @@ -9,32 +9,32 @@ define void @atomic_maxmin_i6432() { ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %2 = atomicrmw min i64* @sc64, i64 6 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %3 = atomicrmw umax i64* @sc64, i64 7 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %4 = atomicrmw umin i64* @sc64, i64 8 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] ret void diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll index 77bbdec826a59..c6300708bcc1a 100644 --- a/test/CodeGen/X86/atomic128.ll +++ b/test/CodeGen/X86/atomic128.ll @@ -167,14 +167,24 @@ define void @fetch_and_min(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB5_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB5_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB5_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB5_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB5_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -203,14 +213,24 @@ define void @fetch_and_max(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %r8, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB6_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB6_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx 
+; CHECK-NEXT: jne LBB6_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB6_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB6_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -239,14 +259,24 @@ define void @fetch_and_umin(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setae %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB7_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB7_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB7_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB7_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB7_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -275,14 +305,24 @@ define void @fetch_and_umax(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setb %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB8_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB8_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB8_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB8_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB8_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index a12a412fb94d6..953f3bdd06e87 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -27,9 +27,9 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_addpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %1, %2 @@ -57,9 +57,9 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_addps: ; ZNVER1: # BB#0: -; 
ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fadd <8 x float> %1, %2 @@ -87,9 +87,9 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_addsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -118,9 +118,9 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_addsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -152,10 +152,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_andnotpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1> @@ -193,10 +193,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_andnotps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1> @@ -234,10 +234,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_andpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; 
ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -273,10 +273,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_andps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -313,9 +313,9 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl ; ZNVER1-LABEL: test_blendpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %a1, %1 @@ -345,8 +345,8 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> * ; ZNVER1-LABEL: test_blendps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] -; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 14, i32 7> @@ -374,9 +374,9 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_blendvpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) %2 = load <4 x double>, <4 x double> *%a3, align 32 %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x 
double> %a2) @@ -405,9 +405,9 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_blendvps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) %2 = load <8 x float>, <8 x float> *%a3, align 32 %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2) @@ -433,8 +433,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) { ; ; ZNVER1-LABEL: test_broadcastf128: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 32 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> ret <8 x float> %2 @@ -458,8 +458,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) { ; ; ZNVER1-LABEL: test_broadcastsd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double *%a0, align 8 %2 = insertelement <4 x double> undef, double %1, i32 0 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer @@ -484,8 +484,8 @@ define <4 x float> @test_broadcastss(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer @@ -510,8 +510,8 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <8 x float> undef, float %1, i32 0 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer @@ -543,9 +543,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_cmppd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fcmp oeq <4 x double> %a0, %2 @@ -581,9 +581,9 @@ define <8 x float> 
@test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_cmpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fcmp oeq <8 x float> %a0, %2 @@ -618,10 +618,10 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2pd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x double> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x double> @@ -655,10 +655,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <8 x i32> %a0 to <8 x float> %2 = load <8 x i32>, <8 x i32> *%a1, align 16 %3 = sitofp <8 x i32> %2 to <8 x float> @@ -690,10 +690,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x double> %a0 to <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptosi <4 x double> %2 to <4 x i32> @@ -725,10 +725,10 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc <4 x double> %a0 to <4 x float> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptrunc <4 x double> %2 to <4 x float> @@ -760,10 +760,10 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_cvtps2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] -; 
ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <8 x float> %a0 to <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = fptosi <8 x float> %2 to <8 x i32> @@ -792,9 +792,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_divpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fdiv <4 x double> %1, %2 @@ -822,9 +822,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_divps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fdiv <8 x float> %1, %2 @@ -853,8 +853,8 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ZNVER1-LABEL: test_dpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7) @@ -886,9 +886,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa ; ZNVER1-LABEL: test_extractf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:0.50] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> store <4 x float> %2, <4 x float> *%a2 @@ -916,9 +916,9 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> 
@llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2) @@ -947,9 +947,9 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2) @@ -978,9 +978,9 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -1009,9 +1009,9 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -1044,9 +1044,9 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float ; ZNVER1-LABEL: test_insertf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] -; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> %3 = load <4 x float>, <4 x float> *%a2, align 16 @@ -1074,8 +1074,8 @@ define <32 x i8> @test_lddqu(i8* %a0) { ; ; ZNVER1-LABEL: test_lddqu: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ret <32 x i8> %1 } @@ -1108,7 +1108,7 @@ define 
<2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) { ; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) ret <2 x double> %1 @@ -1143,7 +1143,7 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2 ; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) ret <4 x double> %1 @@ -1178,7 +1178,7 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) { ; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) ret <4 x float> %1 @@ -1213,7 +1213,7 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) ; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) ret <8 x float> %1 @@ -1243,8 +1243,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_maxpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2) @@ -1274,8 +1274,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_maxps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) @@ -1305,8 +1305,8 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; 
ZNVER1-LABEL: test_minpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2) @@ -1336,8 +1336,8 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_minps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) @@ -1369,10 +1369,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movapd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 32 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 32 @@ -1403,10 +1403,10 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movaps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 32 %2 = fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 32 @@ -1437,10 +1437,10 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movddup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [8:0.50] ; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> @@ -1468,9 +1468,9 @@ define i32 @test_movmskpd(<4 x double> %a0) { ; ; ZNVER1-LABEL: test_movmskpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskpd %ymm0, 
%eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ret i32 %1 } @@ -1496,9 +1496,9 @@ define i32 @test_movmskps(<8 x float> %a0) { ; ; ZNVER1-LABEL: test_movmskps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ret i32 %1 } @@ -1525,9 +1525,9 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movntpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a0 store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0 ret <4 x double> %1 @@ -1554,9 +1554,9 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movntps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a0 store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0 ret <8 x float> %1 @@ -1586,10 +1586,10 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movshdup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [8:0.50] ; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> @@ -1621,10 +1621,10 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movsldup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [8:0.50] ; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, 
i32 6> @@ -1658,10 +1658,10 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movupd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 1 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 1 @@ -1694,10 +1694,10 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movups: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 1 %2 = fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 1 @@ -1725,9 +1725,9 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_mulpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00] -; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fmul <4 x double> %1, %2 @@ -1755,9 +1755,9 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_mulps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fmul <8 x float> %1, %2 @@ -1788,10 +1788,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) ; ; ZNVER1-LABEL: orpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1827,10 +1827,10 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ; ZNVER1-LABEL: test_orps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; 
ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1866,10 +1866,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0> %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> <i32 1, i32 0> @@ -1901,10 +1901,10 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3> @@ -1936,10 +1936,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> @@ -1971,10 +1971,10 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, 
i32 4> @@ -2004,8 +2004,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> ; ZNVER1-LABEL: test_permilvarpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2) @@ -2035,8 +2035,8 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x ; ZNVER1-LABEL: test_permilvarpd_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2) @@ -2066,8 +2066,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> * ; ZNVER1-LABEL: test_permilvarps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2) @@ -2097,8 +2097,8 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3 ; ZNVER1-LABEL: test_permilvarps_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2) @@ -2130,10 +2130,10 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rcpps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2) @@ -2166,10 +2166,10 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_roundpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundpd $7, (%rdi), 
%ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7) @@ -2202,10 +2202,10 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_roundps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7) @@ -2238,10 +2238,10 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rsqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2) @@ -2275,9 +2275,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ZNVER1-LABEL: test_shufpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 2, i32 7> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 1, i32 4, i32 2, i32 7> @@ -2307,8 +2307,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ZNVER1-LABEL: test_shufps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] -; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 
0, i32 3, i32 8, i32 8, i32 4, i32 7, i32 12, i32 12> @@ -2339,10 +2339,10 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_sqrtpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00] -; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) @@ -2375,10 +2375,10 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_sqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00] -; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) @@ -2408,9 +2408,9 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_subpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fsub <4 x double> %1, %2 @@ -2438,9 +2438,9 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_subps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fsub <8 x float> %1, %2 @@ -2477,12 +2477,12 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; ; ZNVER1-LABEL: test_testpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) %2 = 
load <2 x double>, <2 x double> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) @@ -2523,13 +2523,13 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a ; ; ZNVER1-LABEL: test_testpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) @@ -2568,12 +2568,12 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; ; ZNVER1-LABEL: test_testps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) @@ -2614,13 +2614,13 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) ; ; ZNVER1-LABEL: test_testps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) @@ -2654,9 +2654,9 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpckhpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:0.50] 
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> @@ -2686,8 +2686,8 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ZNVER1-LABEL: test_unpckhps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> @@ -2719,9 +2719,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpcklpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> @@ -2751,8 +2751,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ZNVER1-LABEL: test_unpcklps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> @@ -2783,10 +2783,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_xorpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] 
%1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2822,10 +2822,10 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_xorps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2856,7 +2856,7 @@ define void @test_zeroall() { ; ZNVER1-LABEL: test_zeroall: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroall() ret void } @@ -2881,7 +2881,7 @@ define void @test_zeroupper() { ; ZNVER1-LABEL: test_zeroupper: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroupper() ret void } diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll index 017f54b40b2d5..9918d66802564 100644 --- a/test/CodeGen/X86/avx2-arith.ll +++ b/test/CodeGen/X86/avx2-arith.ll @@ -386,13 +386,13 @@ define <8 x i32> @mul_const9(<8 x i32> %x) { define <4 x i32> @mul_const10(<4 x i32> %x) { ; X32-LABEL: mul_const10: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const10: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009> @@ -403,13 +403,13 @@ define <4 x i32> @mul_const10(<4 x i32> %x) { define <4 x i32> @mul_const11(<4 x i32> %x) { ; X32-LABEL: mul_const11: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const11: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152> diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index 042bc217b97cf..a3862d7e27c66 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -13,10 +13,10 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) { ; ; ZNVER1-LABEL: test_pabsb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor 
%ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) %2 = load <32 x i8>, <32 x i8> *%a1, align 32 %3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2) @@ -35,10 +35,10 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_pabsd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) %2 = load <8 x i32>, <8 x i32> *%a1, align 32 %3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2) @@ -57,10 +57,10 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) { ; ; ZNVER1-LABEL: test_pabsw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) %2 = load <16 x i16>, <16 x i16> *%a1, align 32 %3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2) @@ -78,9 +78,9 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; ; ZNVER1-LABEL: test_paddb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = add <32 x i8> %1, %2 @@ -96,9 +96,9 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_paddd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = add <8 x i32> %1, %2 @@ -114,9 +114,9 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_paddq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = add <4 x i64> %1, %2 @@ -132,9 +132,9 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_paddw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: 
[1:0.50] -; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = add <16 x i16> %1, %2 @@ -151,10 +151,10 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pand: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = and <4 x i64> %1, %2 @@ -172,10 +172,10 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pandn: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1> %2 = and <4 x i64> %a1, %1 %3 = load <4 x i64>, <4 x i64> *%a2, align 32 @@ -194,9 +194,9 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_pmulld: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = mul <8 x i32> %1, %2 @@ -212,9 +212,9 @@ define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) ; ; ZNVER1-LABEL: test_pmullw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = mul <16 x i16> %1, %2 @@ -231,10 +231,10 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_por: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <4 x i64> %a0, %a1 %2 
= load <4 x i64>, <4 x i64> *%a2, align 32 %3 = or <4 x i64> %1, %2 @@ -251,9 +251,9 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; ; ZNVER1-LABEL: test_psubb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = sub <32 x i8> %1, %2 @@ -269,9 +269,9 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_psubd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = sub <8 x i32> %1, %2 @@ -287,9 +287,9 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_psubq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = sub <4 x i64> %1, %2 @@ -305,9 +305,9 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_psubw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = sub <16 x i16> %1, %2 @@ -324,10 +324,10 @@ define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pxor: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = xor <4 x i64> %1, %2 diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll index 127726ea30da1..c77714b9e181a 100644 --- a/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/test/CodeGen/X86/avx2-vector-shifts.ll @@ -376,7 +376,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X32: # BB#0: ; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm2 +; X32-NEXT: vpbroadcastd 
{{.*#+}} xmm2 = [8,8,8,8] ; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X32-NEXT: vzeroupper @@ -386,7 +386,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X64: # BB#0: ; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 140299f5495dc..e10a781fabc21 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -1507,7 +1507,7 @@ define <4 x float> @uitofp_4i1_float(<4 x i32> %a) { ; NOVL: # BB#0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NOVL-NEXT: retq ; diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index e1a92c60d1825..6f4bf061a2157 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1630,8 +1630,9 @@ define void @f1(i32 %c) { ; CHECK-LABEL: f1: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: movzbl {{.*}}(%rip), %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorb $1, %al +; CHECK-NEXT: movb {{.*}}(%rip), %al +; CHECK-NEXT: notb %al +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movb %al, {{.*}}(%rip) ; CHECK-NEXT: xorl $1, %edi ; CHECK-NEXT: jmp _f2 ## TAILCALL diff --git a/test/CodeGen/X86/avx512-rotate.ll b/test/CodeGen/X86/avx512-rotate.ll new file mode 100644 index 0000000000000..98fa67ad793d9 --- /dev/null +++ b/test/CodeGen/X86/avx512-rotate.ll @@ -0,0 +1,256 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX + +declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +; Tests showing replacement of variable rotates with immediate splat versions. 
+ +define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> 
@llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>, <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Tests showing replacement of out-of-bounds variable rotates with in-bounds immediate splat versions. 
+ +define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534>, <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_bounds_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534>, <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65>, <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord 
$31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>, <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> <i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534, i32 65534>, <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534, i64 65534>, <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65, i64 65>, <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Constant folding + +define <8 x i64> @test_fold_rol_v8i64() { +; CHECK-LABEL: test_fold_rol_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> <i64 0, i64 1, i64 2, i64 63, i64 65, i64 65534, i64 65535, i64 -1>, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_fold_ror_v8i64() { +; CHECK-LABEL: test_fold_ror_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,9223372036854775808,4611686018427387904,2,9223372036854775808,4,2,2] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> <i64 0, i64 1, i64 2, i64 63, i64 65, i64 65534, i64 65535, i64 -1>, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll index 
10883a5a9a625..ce2b010ec0f27 100644 --- a/test/CodeGen/X86/avx512-shift.ll +++ b/test/CodeGen/X86/avx512-shift.ll @@ -1,136 +1,178 @@ -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX -;CHECK-LABEL: shift_16_i32 -;CHECK: vpsrld -;CHECK: vpslld -;CHECK: vpsrad -;CHECK: ret define <16 x i32> @shift_16_i32(<16 x i32> %a) { +; CHECK-LABEL: shift_16_i32: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrld $1, %zmm0, %zmm0 +; CHECK-NEXT: vpslld $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %c = shl <16 x i32> %b, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> %d = ashr <16 x i32> %c, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> ret <16 x i32> %d; } -;CHECK-LABEL: shift_8_i64 -;CHECK: vpsrlq -;CHECK: vpsllq -;CHECK: vpsraq -;CHECK: ret define <8 x i64> @shift_8_i64(<8 x i64> %a) { +; CHECK-LABEL: shift_8_i64: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlq $1, %zmm0, %zmm0 +; CHECK-NEXT: vpsllq $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsraq $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> %c = shl <8 x i64> %b, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12> %d = ashr <8 x i64> %c, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12> ret <8 x i64> %d; } -;SKX-LABEL: shift_4_i64 -;SKX: vpsrlq -;SKX: vpsllq -;SKX: vpsraq -;SKX: ret define <4 x i64> @shift_4_i64(<4 x i64> %a) { +; KNL-LABEL: shift_4_i64: +; KNL: # BB#0: +; KNL-NEXT: vpsrlq $1, %ymm0, %ymm0 +; KNL-NEXT: vpsllq $12, %ymm0, %ymm0 +; KNL-NEXT: vpsraq $12, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; KNL-NEXT: retq +; +; SKX-LABEL: shift_4_i64: +; SKX: # BB#0: +; SKX-NEXT: vpsrlq $1, %ymm0, %ymm0 +; SKX-NEXT: vpsllq $12, %ymm0, %ymm0 +; SKX-NEXT: vpsraq $12, %ymm0, %ymm0 +; SKX-NEXT: retq %b = lshr <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> %c = shl <4 x i64> %b, <i64 12, i64 12, i64 12, i64 12> %d = ashr <4 x i64> %c, <i64 12, i64 12, i64 12, i64 12> ret <4 x i64> %d; } -; CHECK-LABEL: variable_shl4 -; CHECK: vpsllvq %zmm -; CHECK: ret define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_shl4: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_shl5 -; CHECK: vpsllvd %zmm -; CHECK: ret define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_shl5: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_srl0 -; CHECK: vpsrlvd -; CHECK: ret define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_srl0: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <16 x i32> %x, %y ret 
<16 x i32> %k } -; CHECK-LABEL: variable_srl2 -; CHECK: psrlvq -; CHECK: ret define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_srl2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_sra1 -; CHECK: vpsravd -; CHECK: ret define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_sra1: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_sra2 -; CHECK: vpsravq %zmm -; CHECK: ret define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_sra2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <8 x i64> %x, %y ret <8 x i64> %k } -; SKX-LABEL: variable_sra3 -; SKX: vpsravq %ymm -; SKX: ret define <4 x i64> @variable_sra3(<4 x i64> %x, <4 x i64> %y) { +; KNL-LABEL: variable_sra3: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra3: +; SKX: # BB#0: +; SKX-NEXT: vpsravq %ymm1, %ymm0, %ymm0 +; SKX-NEXT: retq %k = ashr <4 x i64> %x, %y ret <4 x i64> %k } -; SKX-LABEL: variable_sra4 -; SKX: vpsravw %xmm -; SKX: ret define <8 x i16> @variable_sra4(<8 x i16> %x, <8 x i16> %y) { +; KNL-LABEL: variable_sra4: +; KNL: # BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; KNL-NEXT: vpmovsxwd %xmm0, %ymm0 +; KNL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra4: +; SKX: # BB#0: +; SKX-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; SKX-NEXT: retq %k = ashr <8 x i16> %x, %y ret <8 x i16> %k } -; CHECK-LABEL: variable_sra01_load -; CHECK: vpsravd (% -; CHECK: ret define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_sra01_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = ashr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK-LABEL: variable_shl1_load -; CHECK: vpsllvd (% -; CHECK: ret define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_shl1_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = shl <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl0_load -; CHECK: vpsrlvd (% -; CHECK: ret + define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_srl0_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = lshr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl3_load -; CHECK: vpsrlvq (% -; CHECK: ret define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) { +; CHECK-LABEL: variable_srl3_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <8 x i64>, <8 x i64>* %y %k = lshr <8 x i64> %x, %y1 ret <8 x i64> %k diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll new file mode 100644 index 
0000000000000..75be2d9c0f01e --- /dev/null +++ b/test/CodeGen/X86/bmi-schedule.ll @@ -0,0 +1,529 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) { +; GENERIC-LABEL: test_andn_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %eax +; GENERIC-NEXT: notl %edi +; GENERIC-NEXT: andw (%rdx), %di +; GENERIC-NEXT: addl %edi, %eax +; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: notl %edi # sched: [1:0.25] +; HASWELL-NEXT: andw (%rdx), %di # sched: [5:0.50] +; HASWELL-NEXT: addl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: notl %edi # sched: [1:0.50] +; BTVER2-NEXT: andw (%rdx), %di # sched: [4:1.00] +; BTVER2-NEXT: addl %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl %esi, %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: notl %edi # sched: [1:0.25] +; ZNVER1-NEXT: andw (%rdx), %di # sched: [5:0.50] +; ZNVER1-NEXT: addl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a2 + %2 = xor i16 %a0, -1 + %3 = and i16 %2, %a1 + %4 = and i16 %2, %1 + %5 = add i16 %3, %4 + ret i16 %5 +} + +define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_andn_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %ecx +; GENERIC-NEXT: andnl (%rdx), %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [4:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl (%rdx), %edi, %eax # sched: [4:1.00] +; BTVER2-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl (%rdx), %edi, %eax # sched: [5:0.50] +; ZNVER1-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.25] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] 
+ %1 = load i32, i32 *%a2 + %2 = xor i32 %a0, -1 + %3 = and i32 %2, %a1 + %4 = and i32 %2, %1 + %5 = add i32 %3, %4 + ret i32 %5 +} + +define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_andn_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnq %rsi, %rdi, %rcx +; GENERIC-NEXT: andnq (%rdx), %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:1.00] +; BTVER2-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50] +; ZNVER1-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.25] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = xor i64 %a0, -1 + %3 = and i64 %2, %a1 + %4 = and i64 %2, %1 + %5 = add i64 %3, %4 + ret i64 %5 +} + +define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bextr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx +; GENERIC-NEXT: bextrl %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:0.50] +; HASWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_bextr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bextr.32(i32, i32) + +define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bextr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bextrq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:0.50] +; HASWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_bextr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrq 
%rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0) + %3 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.bextr.64(i64, i64) + +define i32 @test_blsi_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsil (%rsi), %ecx +; GENERIC-NEXT: blsil %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsil %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 0, %1 + %3 = sub i32 0, %a0 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsi_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsiq (%rsi), %rcx +; GENERIC-NEXT: blsiq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 0, %1 + %3 = sub i64 0, %a0 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsmsk_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskl (%rsi), %ecx +; GENERIC-NEXT: blsmskl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: 
[?:0.000000e+00] +; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = xor i32 %1, %2 + %5 = xor i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsmsk_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskq (%rsi), %rcx +; GENERIC-NEXT: blsmskq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = xor i64 %1, %2 + %5 = xor i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsr_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrl (%rsi), %ecx +; GENERIC-NEXT: blsrl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsr_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrq (%rsi), %rcx +; GENERIC-NEXT: blsrq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrq %rdi, %rax 
# sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_cttz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntw (%rsi), %cx +; GENERIC-NEXT: tzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.cttz.i16(i16, i1) + +define i32 @test_cttz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_cttz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntl (%rsi), %ecx +; GENERIC-NEXT: tzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.cttz.i32(i32, i1) + +define i64 @test_cttz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_cttz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntq (%rsi), %rcx +; GENERIC-NEXT: tzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntq 
%rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.cttz.i64(i64, i1) diff --git a/test/CodeGen/X86/bmi2-schedule.ll b/test/CodeGen/X86/bmi2-schedule.ll new file mode 100644 index 0000000000000..9666dd85d8535 --- /dev/null +++ b/test/CodeGen/X86/bmi2-schedule.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bzhi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx +; GENERIC-NEXT: bzhil %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [4:0.50] +; HASWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) + +define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bzhi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50] +; HASWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0) + %3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + 
ret i64 %4 +} +declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) + +define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pdep_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepl (%rdx), %edi, %ecx +; GENERIC-NEXT: pdepl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pdep.32(i32, i32) + +define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pdep_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pdepq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.pdep.64(i64, i64) + +define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pext_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextl (%rdx), %edi, %ecx +; GENERIC-NEXT: pextl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pext.32(i32, i32) + +define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pext_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pextq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00] +; 
HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.pext.64(i64, i64) diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll index e292ccd0be11d..7c1042878d591 100644 --- a/test/CodeGen/X86/bool-ext-inc.ll +++ b/test/CodeGen/X86/bool-ext-inc.ll @@ -19,7 +19,7 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> @@ -31,7 +31,7 @@ define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y @@ -56,7 +56,7 @@ define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y @@ -91,7 +91,7 @@ define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> ; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp1 = icmp ne <4 x i32> %a, %b diff --git a/test/CodeGen/X86/bswap-rotate.ll b/test/CodeGen/X86/bswap-rotate.ll new file mode 100644 index 0000000000000..f686febe5645d --- /dev/null +++ b/test/CodeGen/X86/bswap-rotate.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +; Combine BSWAP (lowered to rolw 8) with a second rotate. +; This test checks for combining rotates with inconsistent constant value types. 
+ +define i16 @combine_bswap_rotate(i16 %a0) { +; X86-LABEL: combine_bswap_rotate: +; X86: # BB#0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $9, %ax +; X86-NEXT: retl +; +; X64-LABEL: combine_bswap_rotate: +; X64: # BB#0: +; X64-NEXT: rolw $9, %di +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq + %1 = call i16 @llvm.bswap.i16(i16 %a0) + %2 = shl i16 %1, 1 + %3 = lshr i16 %1, 15 + %4 = or i16 %2, %3 + ret i16 %4 +} + +declare i16 @llvm.bswap.i16(i16) diff --git a/test/CodeGen/X86/clobber-fi0.ll b/test/CodeGen/X86/clobber-fi0.ll index 02f1a1616db2d..b69b18531601a 100644 --- a/test/CodeGen/X86/clobber-fi0.ll +++ b/test/CodeGen/X86/clobber-fi0.ll @@ -15,22 +15,22 @@ bb: %tmp = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp2 = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp3 = alloca i32 ; [#uses=1 type=i32*] - store i32 1, i32* %tmp, align 4 - store i32 1, i32* %tmp2, align 4 + store volatile i32 1, i32* %tmp, align 4 + store volatile i32 1, i32* %tmp2, align 4 br label %bb4 bb4: ; preds = %bb4, %bb - %tmp6 = load i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] + %tmp6 = load volatile i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] %tmp7 = add i32 %tmp6, -1 ; [#uses=2 type=i32] - store i32 %tmp7, i32* %tmp2, align 4 + store volatile i32 %tmp7, i32* %tmp2, align 4 %tmp8 = icmp eq i32 %tmp7, 0 ; [#uses=1 type=i1] - %tmp9 = load i32, i32* %tmp ; [#uses=1 type=i32] + %tmp9 = load volatile i32, i32* %tmp ; [#uses=1 type=i32] %tmp10 = add i32 %tmp9, -1 ; [#uses=1 type=i32] - store i32 %tmp10, i32* %tmp3 + store volatile i32 %tmp10, i32* %tmp3 br i1 %tmp8, label %bb11, label %bb4 bb11: ; preds = %bb4 - %tmp12 = load i32, i32* %tmp, align 4 ; [#uses=1 type=i32] + %tmp12 = load volatile i32, i32* %tmp, align 4 ; [#uses=1 type=i32] ret i32 %tmp12 } diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll index 713ee5d0f65a9..0d74c937af33e 100644 --- a/test/CodeGen/X86/combine-rotates.ll +++ b/test/CodeGen/X86/combine-rotates.ll @@ -6,22 +6,12 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) { ; XOP-LABEL: combine_vec_rot_rot: ; XOP: # BB#0: -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: combine_vec_rot_rot: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28> @@ -40,12 +30,7 @@ define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) { ; ; AVX512-LABEL: combine_vec_rot_rot_splat: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $29, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $10, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprold $7, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3> %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29> @@ -63,12 +48,6 @@ define <4 x i32> 
@combine_vec_rot_rot_splat_zero(<4 x i32> %x) { ; ; AVX512-LABEL: combine_vec_rot_rot_splat_zero: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index 3dbff2680c22f..a6491a0a86940 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -392,7 +392,7 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_gt_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -437,7 +437,7 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_le_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816] ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -481,7 +481,7 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_ashr0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5> @@ -515,7 +515,7 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) { ; AVX-LABEL: combine_vec_shl_add0: ; AVX: # BB#0: ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5> @@ -550,7 +550,7 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_or0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -585,7 +585,7 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_mul0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5> diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 21564cdd73530..473fae19f4fd6 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -91,7 +91,7 @@ define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_known_zero1: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq @@ -326,7 +326,7 @@ define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_shl_mask0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: 
vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2> @@ -376,10 +376,10 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $4, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16> diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll index e1e849929405a..b6ae2fa6d1578 100644 --- a/test/CodeGen/X86/combine-udiv.ll +++ b/test/CodeGen/X86/combine-udiv.ll @@ -166,7 +166,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll index 91da268a8d75a..4c7716bbaebed 100644 --- a/test/CodeGen/X86/combine-urem.ll +++ b/test/CodeGen/X86/combine-urem.ll @@ -43,7 +43,7 @@ define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] ; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = urem <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4> @@ -87,7 +87,7 @@ define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2c: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -146,7 +146,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2d: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -183,7 +183,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll new file mode 100644 index 0000000000000..15ae4a49d7d32 --- /dev/null +++ b/test/CodeGen/X86/f16c-schedule.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=IVY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL 
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1) + %3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) + %4 = fadd <4 x float> %2, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) + +define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_256: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1) + %3 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) + %4 = fadd <8 x float> %2, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) + +define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16> *%a2) { +; IVY-LABEL: test_vcvtps2ph_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: retq # 
sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a1, i32 0) + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i16> %3, <4 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) + +define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16> *%a2) { +; IVY-LABEL: test_vcvtps2ph_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_256: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a1, i32 0) + store <8 x i16> %2, <8 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index 3d5c12c03484f..c87353ed1f5ad 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -316,7 +316,7 @@ define void @allocamaterialize() { ; STDERR-NOT: FastISel missed terminator: ret void ; CHECK-LABEL: win64ccfun -define x86_64_win64cc void @win64ccfun(i32 %i) { +define win64cc void @win64ccfun(i32 %i) { ; CHECK: ret ret void } diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll index fbc4cd9d4f9c0..86469dad23f22 100644 --- a/test/CodeGen/X86/hipe-cc.ll +++ b/test/CodeGen/X86/hipe-cc.ll @@ -48,11 +48,7 @@ entry: store i32 %arg0, i32* %arg0_var store i32 %arg1, i32* %arg1_var store i32 %arg2, i32* %arg2_var - - ; CHECK: movl 16(%esp), %esi - ; CHECK-NEXT: movl 12(%esp), %ebp - ; CHECK-NEXT: movl 8(%esp), %eax - ; CHECK-NEXT: movl 4(%esp), %edx + ; These loads are loading the values from their previous stores and are optimized away. 
%0 = load i32, i32* %hp_var %1 = load i32, i32* %p_var %2 = load i32, i32* %arg0_var diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll index 43e2e1409fdee..efe07cf6301e9 100644 --- a/test/CodeGen/X86/hipe-cc64.ll +++ b/test/CodeGen/X86/hipe-cc64.ll @@ -57,11 +57,7 @@ entry: store i64 %arg2, i64* %arg2_var store i64 %arg3, i64* %arg3_var - ; CHECK: movq 40(%rsp), %r15 - ; CHECK-NEXT: movq 32(%rsp), %rbp - ; CHECK-NEXT: movq 24(%rsp), %rsi - ; CHECK-NEXT: movq 16(%rsp), %rdx - ; CHECK-NEXT: movq 8(%rsp), %rcx + ; Loads are reading values just writen from corresponding register and are therefore noops. %0 = load i64, i64* %hp_var %1 = load i64, i64* %p_var %2 = load i64, i64* %arg0_var diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll new file mode 100644 index 0000000000000..e42ce30c5a6d2 --- /dev/null +++ b/test/CodeGen/X86/lea32-schedule.ll @@ -0,0 +1,653 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_lea_offset(i32) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal -24(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal -24(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # 
kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal -24(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, -24 + ret i32 %2 +} + +define i32 @test_lea_offset_big(i32) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal 1024(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal 1024(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal 1024(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, 1024 + ret i32 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @test_lea_add(i32, i32) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal (%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal (%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; BTVER2-NEXT: # kill: 
%EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i32 %1, %0 + ret i32 %3 +} + +define i32 @test_lea_add_offset(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal 16(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $16, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $16, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, 16 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_add_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal -4096(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # 
kill: %ESI<def> %ESI<kill> %RSI<def> +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-4096, %eax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-4096, %eax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, -4096 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_mul(i32) { +; GENERIC-LABEL: test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal (%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + ret i32 %2 +} + +define i32 @test_lea_mul_offset(i32) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal -32(%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; 
SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-32, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-32, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + %3 = add nsw i32 %2, -32 + ret i32 %3 +} + +define i32 @test_lea_mul_offset_big(i32) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal 10000(%rdi,%rdi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $10000, %eax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $10000, %eax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 9 + %3 = add nsw i32 %2, 10000 + ret i32 %3 +} + +define i32 @test_lea_add_scale(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal (%rdi,%rsi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: 
test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 1 + %4 = add nsw i32 %3, %0 + ret i32 %4 +} + +define i32 @test_lea_add_scale_offset(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal 96(%rdi,%rsi,4), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $96, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $96, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 2 + %4 = add i32 %0, 96 + %5 = add i32 %4, %3 + 
ret i32 %5 +} + +define i32 @test_lea_add_scale_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; GENERIC-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; GENERIC-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ATOM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ATOM-NEXT: leal -1200(%rdi,%rsi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SLM-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; SANDY-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; SANDY-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-1200, %eax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; HASWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-1200, %eax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; BTVER2-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def> +; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; ZNVER1-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 3 + %4 = add i32 %0, -1200 + %5 = add i32 %4, %3 + ret i32 %5 +} diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll new file mode 100644 index 0000000000000..0ff1574c809df --- /dev/null +++ b/test/CodeGen/X86/lea64-schedule.ll @@ -0,0 +1,534 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s 
--check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i64 @test_lea_offset(i64) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -24(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -24(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -24(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, -24 + ret i64 %2 +} + +define i64 @test_lea_offset_big(i64) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 1024(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 1024(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, 1024 + ret i64 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i64 @test_lea_add(i64, i64) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: 
[1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i64 %1, %0 + ret i64 %3 +} + +define i64 @test_lea_add_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 16(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $16, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $16, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, 16 + %4 = add i64 %3, %1 + ret i64 %4 +} + +define i64 @test_lea_add_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -4096(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-4096, %rax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-4096, %rax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, -4096 + %4 = add i64 %3, %1 + ret i64 %4 +} + +define i64 @test_lea_mul(i64) { +; GENERIC-LABEL: test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # 
sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + ret i64 %2 +} + +define i64 @test_lea_mul_offset(i64) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -32(%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-32, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-32, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + %3 = add nsw i64 %2, -32 + ret i64 %3 +} + +define i64 @test_lea_mul_offset_big(i64) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 10000(%rdi,%rdi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $10000, %rax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $10000, %rax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: 
+; BTVER2-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 9 + %3 = add nsw i64 %2, 10000 + ret i64 %3 +} + +define i64 @test_lea_add_scale(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 1 + %4 = add nsw i64 %3, %0 + ret i64 %4 +} + +define i64 @test_lea_add_scale_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 96(%rdi,%rsi,4), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $96, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $96, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 2 + %4 = add i64 %0, 96 + %5 = add i64 %4, %3 + ret i64 %5 +} + +define i64 @test_lea_add_scale_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -1200(%rdi,%rsi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: 
test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-1200, %rax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-1200, %rax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 3 + %4 = add i64 %0, -1200 + %5 = add i64 %4, %3 + ret i64 %5 +} diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll index b3f2116e6486d..3ad6cad32d834 100644 --- a/test/CodeGen/X86/legalize-shift-64.ll +++ b/test/CodeGen/X86/legalize-shift-64.ll @@ -148,8 +148,7 @@ define i32 @test6() { ; CHECK-NEXT: andl $-8, %esp ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $1, (%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: shldl $32, %eax, %ecx @@ -175,9 +174,8 @@ define i32 @test6() { ; CHECK-NEXT: retl %x = alloca i32, align 4 %t = alloca i64, align 8 - store i32 1, i32* %x, align 4 - store i64 1, i64* %t, align 8 ;; DEAD - %load = load i32, i32* %x, align 4 + store volatile i32 1, i32* %x, align 4 + %load = load volatile i32, i32* %x, align 4 %shl = shl i32 %load, 8 %add = add i32 %shl, -224 %sh_prom = zext i32 %add to i64 diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll new file mode 100644 index 0000000000000..cd0dcbbd6afbd --- /dev/null +++ b/test/CodeGen/X86/lzcnt-schedule.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctlz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntw (%rsi), %cx +; GENERIC-NEXT: lzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntw (%rsi), %cx +; HASWELL-NEXT: lzcntw 
%di, %ax +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntw (%rsi), %cx +; BTVER2-NEXT: lzcntw %di, %ax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntw (%rsi), %cx +; ZNVER1-NEXT: lzcntw %di, %ax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctlz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.ctlz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctlz.i16(i16, i1) + +define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctlz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntl (%rsi), %ecx +; GENERIC-NEXT: lzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntl (%rsi), %ecx +; HASWELL-NEXT: lzcntl %edi, %eax +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntl (%rsi), %ecx +; BTVER2-NEXT: lzcntl %edi, %eax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntl (%rsi), %ecx +; ZNVER1-NEXT: lzcntl %edi, %eax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctlz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.ctlz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctlz.i32(i32, i1) + +define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctlz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntq (%rsi), %rcx +; GENERIC-NEXT: lzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntq (%rsi), %rcx +; HASWELL-NEXT: lzcntq %rdi, %rax +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntq (%rsi), %rcx +; BTVER2-NEXT: lzcntq %rdi, %rax +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntq (%rsi), %rcx +; ZNVER1-NEXT: lzcntq %rdi, %rax +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctlz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.ctlz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctlz.i64(i64, i1) diff --git a/test/CodeGen/X86/machine-outliner-debuginfo.ll b/test/CodeGen/X86/machine-outliner-debuginfo.ll index 26a194764086d..02d0964e37eb5 100644 --- a/test/CodeGen/X86/machine-outliner-debuginfo.ll +++ b/test/CodeGen/X86/machine-outliner-debuginfo.ll @@ -17,6 +17,7 @@ define i32 @main() #0 !dbg !11 { call void @llvm.dbg.value(metadata i32 10, i64 0, metadata !15, metadata !16), !dbg !17 store i32 4, i32* %5, align 4 store i32 0, i32* @x, align 4, !dbg !24 + call void asm sideeffect 
"", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; This is the same sequence of instructions without a debug value. It should be outlined ; in the same way. ; CHECK: callq l_OUTLINED_FUNCTION_0 diff --git a/test/CodeGen/X86/machine-outliner.ll b/test/CodeGen/X86/machine-outliner.ll index 9f8e6ec298f4e..b4a277ec2d825 100644 --- a/test/CodeGen/X86/machine-outliner.ll +++ b/test/CodeGen/X86/machine-outliner.ll @@ -85,6 +85,7 @@ define i32 @main() #0 { store i32 3, i32* %4, align 4 store i32 4, i32* %5, align 4 store i32 1, i32* @x, align 4 + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: callq [[OFUNC2]] store i32 1, i32* %2, align 4 store i32 2, i32* %3, align 4 diff --git a/test/CodeGen/X86/memcmp-minsize.ll b/test/CodeGen/X86/memcmp-minsize.ll new file mode 100644 index 0000000000000..a7f42644ca2d5 --- /dev/null +++ b/test/CodeGen/X86/memcmp-minsize.ll @@ -0,0 +1,721 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # BB#0: +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp 
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $4, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: pushq $4 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al 
+; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: pushq $8 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; 
X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # BB#0: +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: +; X64-NEXT: pushq $16 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; 
X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: pushq $32 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # 
BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length64: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll new file mode 100644 index 0000000000000..450205a966d23 --- /dev/null +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -0,0 +1,871 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; 
X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $2, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, 
%eax +; X64-NEXT: .LBB5_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # 
%res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: incl %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax +; X64-NEXT: bswapq %rcx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 
%c +} + +define i1 @length8_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB13_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx 
+; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp 
+; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length64: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + 
+define i1 @length64_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 0e09abf73c8c9..2e67827654624 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=AVX2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -12,43 +12,21 @@ declare i32 @memcmp(i8*, i8*, i64) define i32 @length2(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length2: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movzwl (%ecx), %ecx -; X86-NOSSE-NEXT: movzwl (%eax), %eax -; X86-NOSSE-NEXT: rolw $8, %cx -; X86-NOSSE-NEXT: rolw $8, %ax -; X86-NOSSE-NEXT: cmpw %ax, %cx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB0_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB0_3 -; X86-NOSSE-NEXT: .LBB0_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB0_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB0_4 -; X86-NOSSE-NEXT: .LBB0_3: -; X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length2: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movzwl (%ecx), %ecx -; X86-SSE2-NEXT: movzwl (%eax), %eax -; X86-SSE2-NEXT: rolw $8, %cx -; X86-SSE2-NEXT: rolw $8, %ax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpw %ax, %cx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length2: ; X64: # BB#0: @@ -137,44 
+115,90 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length3: -; X64: # BB#0: -; X64-NEXT: movl $3, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m } define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB5_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $3, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind %c = icmp ne i32 %m, 0 @@ -182,43 +206,21 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind { } define i32 @length4(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length4: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%ecx), %ecx -; X86-NOSSE-NEXT: movl (%eax), %eax -; X86-NOSSE-NEXT: bswapl %ecx -; X86-NOSSE-NEXT: 
bswapl %eax -; X86-NOSSE-NEXT: cmpl %eax, %ecx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB6_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB6_3 -; X86-NOSSE-NEXT: .LBB6_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB6_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB6_4 -; X86-NOSSE-NEXT: .LBB6_3: -; X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length4: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%ecx), %ecx -; X86-SSE2-NEXT: movl (%eax), %eax -; X86-SSE2-NEXT: bswapl %ecx -; X86-SSE2-NEXT: bswapl %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpl %eax, %ecx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length4: ; X64: # BB#0: @@ -278,44 +280,86 @@ define i1 @length4_eq_const(i8* %X) nounwind { define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length5: -; X64: # BB#0: -; X64-NEXT: movl $5, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m } define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl 
+; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB10_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $5, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp ne i32 %m, 0 @@ -324,13 +368,33 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length8: @@ -352,13 +416,20 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -376,13 +447,17 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB13_3: # %endblock ; 
X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -400,25 +475,43 @@ define i1 @length8_eq_const(i8* %X) nounwind { define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: je .LBB14_4 +; X86-NEXT: .LBB14_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB14_4: # %endblock +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $12, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind %c = icmp ne i32 %m, 0 @@ -427,19 +520,66 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#4: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: -; X64-NEXT: movl $12, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; 
X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m } @@ -448,111 +588,165 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length16: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#5: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB16_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: -; X64-NEXT: movl $16, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m } define i1 @length16_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu 
(%rsi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 12(%eax), %edx +; X86-NEXT: je .LBB17_5 +; X86-NEXT: .LBB17_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB17_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp } define i1 @length16_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $858927408, (%eax) # imm = 0x33323130 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: cmpl $926299444, 4(%eax) # imm = 0x37363534 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: cmpl $825243960, 8(%eax) # imm = 0x31303938 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl $892613426, 12(%eax) # imm = 0x35343332 +; X86-NEXT: je .LBB18_5 +; X86-NEXT: .LBB18_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB18_5: # %endblock +; X86-NEXT: 
testl %ecx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -570,9 +764,43 @@ define i32 @length32(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length32: -; X64: # BB#0: -; X64-NEXT: movl $32, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rcx +; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: movq 24(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#5: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB19_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind ret i32 %m } @@ -592,25 +820,30 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: cmpq 8(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: cmpq 16(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 24(%rsi), %rcx 
+; X64-NEXT: je .LBB20_5 +; X64-NEXT: .LBB20_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB20_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -629,26 +862,30 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movabsq $3833745473465760056, %rax # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rax, 8(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-NEXT: cmpq %rax, 16(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3544395820347831604, %rcx # imm = 0x3130393837363534 +; X64-NEXT: cmpq %rcx, 24(%rdi) +; X64-NEXT: je .LBB21_5 +; X64-NEXT: .LBB21_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB21_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind %c = icmp ne i32 %m, 0 ret i1 %c diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 50a661fcca114..76d750855cd4f 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -105,7 +105,7 @@ define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v4i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -523,7 +523,7 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v8i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117] ; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq entry: @@ -551,7 +551,7 @@ define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind { ; ; AVX-LABEL: mul_v4i64c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117] ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll new file mode 100644 index 0000000000000..c0d11280fc1da --- /dev/null +++ b/test/CodeGen/X86/popcnt-schedule.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule 
-mattr=+popcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctpop_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntw (%rsi), %cx +; GENERIC-NEXT: popcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i16: +; SLM: # BB#0: +; SLM-NEXT: popcntw (%rsi), %cx # sched: [6:1.00] +; SLM-NEXT: popcntw %di, %ax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i16: +; SANDY: # BB#0: +; SANDY-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; SANDY-NEXT: popcntw %di, %ax # sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: popcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntw (%rsi), %cx # sched: [8:1.00] +; BTVER2-NEXT: popcntw %di, %ax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntw (%rsi), %cx # sched: [10:1.00] +; ZNVER1-NEXT: popcntw %di, %ax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctpop.i16( i16 %1 ) + %3 = tail call i16 @llvm.ctpop.i16( i16 %a0 ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctpop.i16(i16) + +define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctpop_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntl (%rsi), %ecx +; GENERIC-NEXT: popcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i32: +; SLM: # 
BB#0: +; SLM-NEXT: popcntl (%rsi), %ecx # sched: [6:1.00] +; SLM-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i32: +; SANDY: # BB#0: +; SANDY-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; SANDY-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00] +; BTVER2-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntl (%rsi), %ecx # sched: [10:1.00] +; ZNVER1-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctpop.i32( i32 %1 ) + %3 = tail call i32 @llvm.ctpop.i32( i32 %a0 ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctpop.i32(i32) + +define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctpop_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntq (%rsi), %rcx +; GENERIC-NEXT: popcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i64: +; SLM: # BB#0: +; SLM-NEXT: popcntq (%rsi), %rcx # sched: [6:1.00] +; SLM-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i64: +; SANDY: # BB#0: +; SANDY-NEXT: popcntq (%rsi), %rcx # sched: [9:1.00] +; SANDY-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00] +; BTVER2-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntq (%rsi), %rcx # sched: [10:1.00] +; ZNVER1-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctpop.i64( i64 %1 ) + %3 = tail call i64 @llvm.ctpop.i64( i64 %a0 ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctpop.i64(i64) diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll new file mode 100644 index 0000000000000..26c4bdb2375ab --- /dev/null +++ b/test/CodeGen/X86/pr32282.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X64 + +; Check for assert in foldMaskAndShiftToScale 
due to out of range mask scaling. + +@b = common global i8 zeroinitializer, align 1 +@c = common global i8 zeroinitializer, align 1 +@d = common global i64 zeroinitializer, align 8 +@e = common global i64 zeroinitializer, align 8 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: +; X86-NEXT: pushl %eax +; X86-NEXT: .Lcfi0: +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: movl d, %eax +; X86-NEXT: movl d+4, %ecx +; X86-NEXT: movl $701685459, %edx # imm = 0x29D2DED3 +; X86-NEXT: andnl %edx, %ecx, %ecx +; X86-NEXT: movl $-564453154, %edx # imm = 0xDE5B20DE +; X86-NEXT: andnl %edx, %eax, %edx +; X86-NEXT: shrdl $21, %ecx, %edx +; X86-NEXT: shrl $21, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testb %al, %al +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: addl $7, %edx +; X86-NEXT: adcxl %eax, %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: .Lcfi1: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %edx +; X86-NEXT: .Lcfi2: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi3: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi4: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: calll __divdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: .Lcfi5: +; X86-NEXT: .cfi_adjust_cfa_offset -16 +; X86-NEXT: orl %eax, %edx +; X86-NEXT: setne {{[0-9]+}}(%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl +; +; X64-LABEL: foo: +; X64: # BB#0: +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000 +; X64-NEXT: andnq %rcx, %rax, %rcx +; X64-NEXT: shrq $21, %rcx +; X64-NEXT: addq $7, %rcx +; X64-NEXT: movabsq $4393751543808, %rax # imm = 0x3FF00000000 +; X64-NEXT: testq %rax, %rcx +; X64-NEXT: je .LBB0_1 +; X64-NEXT: # BB#2: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: idivq %rcx +; X64-NEXT: jmp .LBB0_3 +; X64-NEXT: .LBB0_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def> +; X64-NEXT: .LBB0_3: +; X64-NEXT: testq %rax, %rax +; X64-NEXT: setne -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + %1 = alloca i8, align 1 + %2 = load i64, i64* @d, align 8 + %3 = or i64 -3013716102214263007, %2 + %4 = xor i64 %3, -1 + %5 = load i64, i64* @e, align 8 + %6 = load i8, i8* @b, align 1 + %7 = trunc i8 %6 to i1 + %8 = zext i1 %7 to i64 + %9 = xor i64 %5, %8 + %10 = load i8, i8* @c, align 1 + %11 = trunc i8 %10 to i1 + %12 = zext i1 %11 to i32 + %13 = or i32 551409149, %12 + %14 = sub nsw i32 %13, 551409131 + %15 = zext i32 %14 to i64 + %16 = shl i64 %9, %15 + %17 = sub nsw i64 %16, 223084523 + %18 = ashr i64 %4, %17 + %19 = and i64 %18, 9223372036854775806 + %20 = add nsw i64 7, %19 + %21 = sdiv i64 0, %20 + %22 = icmp ne i64 %21, 0 + %23 = zext i1 %22 to i8 + store i8 %23, i8* %1, align 1 + ret void +} diff --git a/test/CodeGen/X86/pr32515.ll b/test/CodeGen/X86/pr32515.ll new file mode 100644 index 0000000000000..aeb6803867aaa --- /dev/null +++ b/test/CodeGen/X86/pr32515.ll @@ -0,0 +1,29 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s +; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s +; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s +; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s +; REQUIRES: asserts + +@var_26 = external global i16, align 2 + +define void @foo() #0 { + %1 = alloca i16, align 2 + %2 = load i16, i16* @var_26, align 2 + %3 = zext i16 %2 to i32 + %4 = icmp ne i32 
%3, 7 + %5 = zext i1 %4 to i16 + store i16 %5, i16* %1, align 2 + %6 = load i16, i16* @var_26, align 2 + %7 = zext i16 %6 to i32 + %8 = and i32 1, %7 + %9 = shl i32 %8, 0 + %10 = load i16, i16* @var_26, align 2 + %11 = zext i16 %10 to i32 + %12 = icmp ne i32 %11, 7 + %13 = zext i1 %12 to i32 + %14 = and i32 %9, %13 + %15 = icmp ne i32 %14, 0 + %16 = zext i1 %15 to i8 + store i8 %16, i8* undef, align 1 + unreachable + } diff --git a/test/CodeGen/X86/pr33772.ll b/test/CodeGen/X86/pr33772.ll new file mode 100644 index 0000000000000..ff22c7478866b --- /dev/null +++ b/test/CodeGen/X86/pr33772.ll @@ -0,0 +1,15 @@ +; RUN: not llc < %s -mcpu=skylake-avx512 2>&1 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +; make sure we don't crash if scale for gather isn't constant. + +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.x86.avx512.gather.dpi.512 +declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32) + +define internal <16 x i32> @__gather_base_offsets32_i32(i8* readonly %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i8> %vecmask) { + %mask_vec_i1.i.i = icmp ne <16 x i8> %vecmask, zeroinitializer + %mask_i16.i = bitcast <16 x i1> %mask_vec_i1.i.i to i16 + %res = tail call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> undef, i8* %ptr, <16 x i32> %offsets, i16 %mask_i16.i, i32 %offset_scale) + ret <16 x i32> %res +} diff --git a/test/CodeGen/X86/pr33828.ll b/test/CodeGen/X86/pr33828.ll new file mode 100644 index 0000000000000..1b7f44323b612 --- /dev/null +++ b/test/CodeGen/X86/pr33828.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X64 + +@var_580 = external local_unnamed_addr global i8, align 1 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: # %entry +; X86-NEXT: movsbl var_580, %eax +; X86-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # BB#2: # %if.end13 +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: # %if.then11 +; +; X64-LABEL: foo: +; X64: # BB#0: # %entry +; X64-NEXT: movsbl {{.*}}(%rip), %eax +; X64-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # BB#2: # %if.end13 +; X64-NEXT: retq +; X64-NEXT: .LBB0_1: # %if.then11 +entry: + %tmp = icmp ugt i8 undef, 60 + %phitmp = zext i1 %tmp to i16 + br label %if.end + +if.end: + %tmp1 = load i8, i8* @var_580, align 1 + %conv7 = sext i8 %tmp1 to i32 + %conv8 = zext i16 %phitmp to i32 + %mul = shl nuw nsw i32 %conv8, 1 + %div9 = udiv i32 %mul, 71 + %sub = add nsw i32 %div9, -3 + %shl = shl i32 1, %sub + %neg = xor i32 %shl, -1 + %and = and i32 %neg, %conv7 + %tobool10 = icmp eq i32 %and, 0 + br i1 %tobool10, label %if.end13, label %if.then11 + +if.then11: + unreachable + +if.end13: + ret void +} diff --git a/test/CodeGen/X86/regparm.ll b/test/CodeGen/X86/regparm.ll index 9484e5a9490bd..f427010edc516 100644 --- a/test/CodeGen/X86/regparm.ll +++ b/test/CodeGen/X86/regparm.ll @@ -1,4 +1,4 @@ -; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck -check-prefix=CHECK %s +; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck %s ; RUN: llc %s -mtriple=i386-pc-win32 -o - | FileCheck -check-prefix=WIN %s ; RUN: llc %s -mtriple=i386-pc-linux -fast-isel -o - | FileCheck -check-prefix=FAST %s ; RUN: llc %s -mtriple=i386-pc-win32 -fast-isel -o - | FileCheck -check-prefix=FASTWIN %s 
diff --git a/test/CodeGen/X86/rotate_vec.ll b/test/CodeGen/X86/rotate_vec.ll new file mode 100644 index 0000000000000..8fb000bae827d --- /dev/null +++ b/test/CodeGen/X86/rotate_vec.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s + +define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> + %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + %2 = and <4 x i32> %1, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760> + + %3 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> + %4 = and <4 x i32> %3, <i32 0, i32 4294901760, i32 0, i32 4294901760> + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> + %2 = and <4 x i32> %1, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760> + + %3 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28> + %4 = and <4 x i32> %3, <i32 0, i32 4294901760, i32 0, i32 4294901760> + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} diff --git a/test/CodeGen/X86/sibcall-win64.ll b/test/CodeGen/X86/sibcall-win64.ll index 204e1f8b050ba..b9d5a4813e09a 100644 --- a/test/CodeGen/X86/sibcall-win64.ll +++ b/test/CodeGen/X86/sibcall-win64.ll @@ -1,15 +1,15 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -declare x86_64_win64cc void @win64_callee(i32) -declare x86_64_win64cc void (i32)* @win64_indirect() -declare x86_64_win64cc void @win64_other(i32) +declare win64cc void @win64_callee(i32) +declare win64cc void (i32)* @win64_indirect() +declare win64cc void @win64_other(i32) declare void @sysv_callee(i32) declare void (i32)* @sysv_indirect() declare void @sysv_other(i32) define void @sysv_caller(i32 %p1) { entry: - tail call x86_64_win64cc void @win64_callee(i32 %p1) + tail call win64cc void @win64_callee(i32 %p1) ret void } @@ -19,7 +19,7 @@ entry: ; CHECK: addq $40, %rsp ; CHECK: retq -define x86_64_win64cc void @win64_caller(i32 %p1) { +define win64cc void @win64_caller(i32 %p1) { entry: tail call void @sysv_callee(i32 %p1) ret void @@ -37,18 +37,18 @@ define void @sysv_matched(i32 %p1) { ; CHECK-LABEL: sysv_matched: ; CHECK: jmp sysv_callee # TAILCALL -define x86_64_win64cc void @win64_matched(i32 %p1) { - tail call x86_64_win64cc void @win64_callee(i32 %p1) +define win64cc void @win64_matched(i32 %p1) { + tail call win64cc void @win64_callee(i32 %p1) ret void } ; CHECK-LABEL: win64_matched: ; CHECK: jmp 
win64_callee # TAILCALL -define x86_64_win64cc void @win64_indirect_caller(i32 %p1) { - %1 = call x86_64_win64cc void (i32)* @win64_indirect() - call x86_64_win64cc void @win64_other(i32 0) - tail call x86_64_win64cc void %1(i32 %p1) +define win64cc void @win64_indirect_caller(i32 %p1) { + %1 = call win64cc void (i32)* @win64_indirect() + call win64cc void @win64_other(i32 0) + tail call win64cc void %1(i32 %p1) ret void } diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index c41acd43b3ab6..29f726c3df6a7 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; GENERIC-LABEL: test_addps: @@ -45,6 +45,12 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fadd <4 x float> %1, %2 @@ -87,6 +93,12 @@ define float @test_addss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fadd float %1, %2 @@ -137,6 +149,12 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -191,6 +209,12 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), 
%xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1> @@ -245,6 +269,13 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fcmp oeq <4 x float> %a0, %2 @@ -290,6 +321,12 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = insertelement <4 x float> undef, float %a1, i32 0 %3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0) @@ -385,6 +422,20 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2) @@ -435,6 +486,13 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) { ; BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to float %2 = load i32, i32 *%a1, align 4 %3 = sitofp i32 %2 to float @@ -484,6 +542,13 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) { ; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ssq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; 
ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to float %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to float @@ -533,6 +598,13 @@ define i32 @test_cvtss2si(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -585,6 +657,13 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -637,6 +716,13 @@ define i32 @test_cvttss2si(float %a0, float *%a1) { ; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i32 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i32 @@ -686,6 +772,13 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) { ; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i64 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i64 @@ -729,6 +822,12 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fdiv <4 x float> %1, %2 @@ -771,6 +870,12 @@ define float @test_divss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivss (%rdi), %xmm0, 
%xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fdiv float %1, %2 @@ -813,6 +918,12 @@ define void @test_ldmxcsr(i32 %a0) { ; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ldmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* store i32 %a0, i32* %1 @@ -857,6 +968,12 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2) @@ -900,6 +1017,12 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2) @@ -943,6 +1066,12 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2) @@ -986,6 +1115,12 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %2) @@ -1035,6 +1170,13 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) { ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # 
sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movaps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 16 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 16 @@ -1079,6 +1221,11 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> ret <4 x float> %1 } @@ -1129,6 +1276,13 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -1177,6 +1331,12 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) { ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> %2 = fadd <4 x float> %a1, %1 ret <4 x float> %2 @@ -1224,6 +1384,13 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -1266,6 +1433,11 @@ define i32 @test_movmskps(<4 x float> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ret i32 %1 } @@ -1307,6 +1479,11 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # 
sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] store <4 x float> %a0, <4 x float> *%a1, align 16, !nontemporal !0 ret void } @@ -1353,6 +1530,13 @@ define void @test_movss_mem(float* %a0, float* %a1) { ; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovss %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float* %a0, align 1 %2 = fadd float %1, %1 store float %2, float *%a1, align 1 @@ -1395,6 +1579,11 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ret <4 x float> %1 } @@ -1441,6 +1630,13 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) { ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movups: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovups (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 1 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 1 @@ -1483,6 +1679,12 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fmul <4 x float> %1, %2 @@ -1525,6 +1727,12 @@ define float @test_mulss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fmul float %1, %2 @@ -1575,6 +1783,12 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps 
(%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -1621,6 +1835,11 @@ define void @test_prefetchnta(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_prefetchnta: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: prefetchnta (%rdi) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) ret void } @@ -1670,6 +1889,13 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrcpps (%rdi), %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2) @@ -1728,6 +1954,14 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) { ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1782,6 +2016,13 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2) @@ -1840,6 +2081,14 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) { ; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1886,6 +2135,11 @@ define void @test_sfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: sfence # sched: [1:1.00] ; BTVER2-NEXT: retq 
# sched: [4:1.00] +; +; ZNVER1-LABEL: test_sfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: sfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse.sfence() ret void } @@ -1931,6 +2185,12 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 3, i32 4, i32 4> @@ -1980,6 +2240,13 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtps (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %2) @@ -2038,6 +2305,14 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2) @@ -2082,6 +2357,12 @@ define i32 @test_stmxcsr() { ; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_stmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -2126,6 +2407,12 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fsub <4 x float> %1, %2 @@ -2168,6 +2455,12 @@ define float 
@test_subss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fsub float %1, %2 @@ -2258,6 +2551,20 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2) @@ -2306,6 +2613,12 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7> @@ -2352,6 +2665,12 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> @@ -2402,6 +2721,12 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps 
(%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll index 3c36b21381390..6ee908e0c7871 100644 --- a/test/CodeGen/X86/sse2-schedule.ll +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addpd: @@ -45,6 +45,12 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %1, %2 @@ -87,6 +93,12 @@ define double @test_addsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fadd double %1, %2 @@ -135,6 +147,13 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -188,6 +207,13 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> 
%a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1> @@ -243,6 +269,13 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fcmp oeq <2 x double> %a0, %2 @@ -288,6 +321,12 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = insertelement <2 x double> undef, double %a1, i32 0 %3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0) @@ -383,6 +422,20 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2) @@ -433,6 +486,13 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> %2 = sitofp <2 x i32> %1 to <2 x double> %3 = load <4 x i32>, <4 x i32>*%a1, align 16 @@ -485,6 +545,13 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: 
vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x float> %2 = load <4 x i32>, <4 x i32>*%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x float> @@ -535,6 +602,13 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2) @@ -586,6 +660,13 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2) @@ -637,6 +718,13 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2) @@ -688,6 +776,13 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1> %2 = fpext <2 x float> %1 to <2 x double> %3 = load <4 x float>, <4 x float> *%a1, align 16 @@ -739,6 +834,13 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> 
%1) %3 = load double, double *%a1, align 8 @@ -791,6 +893,13 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1) %3 = load double, double *%a1, align 8 @@ -850,6 +959,14 @@ define float @test_cvtsd2ss(double %a0, double *%a1) { ; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc double %a0 to float %2 = load double, double *%a1, align 8 %3 = fptrunc double %2 to float @@ -899,6 +1016,13 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) { ; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to double %2 = load i32, i32 *%a1, align 8 %3 = sitofp i32 %2 to double @@ -948,6 +1072,13 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) { ; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to double %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to double @@ -1006,6 +1137,14 @@ define double @test_cvtss2sd(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fpext float %a0 to double %2 = load float, float *%a1, align 4 %3 = fpext float %2 to double @@ -1056,6 +1195,13 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_cvttpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <2 x double> %a0 to <2 x i32> %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %3 = load <2 x double>, <2 x double> *%a1, align 16 @@ -1108,6 +1254,13 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x float> %a0 to <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = fptosi <4 x float> %2 to <4 x i32> @@ -1157,6 +1310,13 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) { ; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i32 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i32 @@ -1206,6 +1366,13 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) { ; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i64 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i64 @@ -1249,6 +1416,12 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fdiv <2 x double> %1, %2 @@ -1291,6 +1464,12 @@ define double @test_divsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fdiv double %1, %2 @@ -1333,6 +1512,11 @@ define void @test_lfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: lfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; 
ZNVER1-LABEL: test_lfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.lfence() ret void } @@ -1374,6 +1558,11 @@ define void @test_mfence() { ; BTVER2: # BB#0: ; BTVER2-NEXT: mfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: mfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.mfence() ret void } @@ -1413,6 +1602,11 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) ret void } @@ -1454,6 +1648,12 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2) @@ -1497,6 +1697,12 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2) @@ -1540,6 +1746,12 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2) @@ -1583,6 +1795,12 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 
x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2) @@ -1632,6 +1850,13 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movapd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 16 %2 = fadd <2 x double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 16 @@ -1680,6 +1905,13 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 16 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 16 @@ -1728,6 +1960,13 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqu (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 1 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 1 @@ -1794,6 +2033,16 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> undef, i32 %a1, i32 0 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> undef, i32 %2, i32 0 @@ -1865,6 +2114,16 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) { ; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> undef, i64 %a1, i64 0 %2 = load i64, i64 
*%a2 %3 = insertelement <2 x i64> undef, i64 %2, i64 0 @@ -1918,6 +2177,13 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 1 @@ -1969,6 +2235,13 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 0 @@ -2010,6 +2283,11 @@ define i32 @test_movmskpd(<2 x double> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ret i32 %1 } @@ -2053,6 +2331,12 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) { ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a0 store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0 ret void @@ -2094,6 +2378,12 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a0 store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0 ret void @@ -2141,6 +2431,13 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) { ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64* %a1, align 1 %2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0 %3 = add <2 x i64> %a0, %2 @@ -2187,6 +2484,12 @@ define <2 x i64> 
@test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) { ; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> %2 = add <2 x i64> %a1, %1 ret <2 x i64> %2 @@ -2234,6 +2537,13 @@ define void @test_movsd_mem(double* %a0, double* %a1) { ; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double* %a0, align 1 %2 = fadd double %1, %1 store double %2, double *%a1, align 1 @@ -2277,6 +2587,11 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 0> ret <2 x double> %1 } @@ -2323,6 +2638,13 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) { ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movupd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovupd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 1 %2 = fadd <2 x double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 1 @@ -2365,6 +2687,12 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fmul <2 x double> %1, %2 @@ -2407,6 +2735,12 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fmul double %1, %2 @@ -2455,6 +2789,13 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; 
BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -2510,6 +2851,12 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packssdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -2562,6 +2909,12 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packsswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2614,6 +2967,12 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packuswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2662,6 +3021,12 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = add <16 x i8> %1, %2 @@ -2708,6 +3073,12 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x 
i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = add <4 x i32> %1, %2 @@ -2750,6 +3121,12 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = add <2 x i64> %1, %2 @@ -2796,6 +3173,12 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2) @@ -2843,6 +3226,12 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2) @@ -2890,6 +3279,12 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) @@ -2937,6 +3332,12 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) @@ -2984,6 +3385,12 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # 
sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = add <8 x i16> %1, %2 @@ -3032,6 +3439,13 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pand: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = and <2 x i64> %1, %2 @@ -3087,6 +3501,13 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pandn: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, <i64 -1, i64 -1> %2 = and <2 x i64> %a1, %1 %3 = load <2 x i64>, <2 x i64> *%a2, align 16 @@ -3136,6 +3557,12 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2) @@ -3183,6 +3610,12 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2) @@ -3234,6 +3667,13 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <16 x i8> %a0, %a1 %2 = load <16 x i8>, 
<16 x i8> *%a2, align 16 %3 = icmp eq <16 x i8> %a0, %2 @@ -3286,6 +3726,13 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3338,6 +3785,13 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp eq <8 x i16> %a0, %2 @@ -3391,6 +3845,13 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = icmp sgt <16 x i8> %a0, %2 @@ -3444,6 +3905,13 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3497,6 +3965,13 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp sgt <8 x i16> %a0, %2 @@ -3541,6 +4016,12 @@ define i16 @test_pextrw(<8 x i16> %a0) { ; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; 
ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 6 ret i16 %1 } @@ -3585,6 +4066,12 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) { ; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 %2 = load i16, i16 *%a2 %3 = insertelement <8 x i16> %1, i16 %2, i32 3 @@ -3635,6 +4122,12 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaddwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <4 x i32> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -3683,6 +4176,12 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %1, <8 x i16> %2) @@ -3730,6 +4229,12 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2) @@ -3777,6 +4282,12 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2) @@ -3824,6 +4335,12 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, 
<16 x i8> *%a2) { ; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %1, <16 x i8> %2) @@ -3863,6 +4380,11 @@ define i32 @test_pmovmskb(<16 x i8> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovmskb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ret i32 %1 } @@ -3904,6 +4426,12 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2) @@ -3947,6 +4475,12 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2) @@ -3990,6 +4524,12 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmullw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = mul <8 x i16> %1, %2 @@ -4040,6 +4580,12 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuludq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ 
-4090,6 +4636,13 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_por: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = or <2 x i64> %1, %2 @@ -4141,6 +4694,12 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) %2 = bitcast <2 x i64> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -4193,6 +4752,13 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> @@ -4244,6 +4810,13 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [8:0.50] +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4> @@ -4295,6 +4868,13 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshuflw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [8:0.50] +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, 
i32 6, i32 7> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> @@ -4344,6 +4924,13 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2) @@ -4389,6 +4976,11 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2> ret <4 x i32> %1 } @@ -4435,6 +5027,13 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2) @@ -4486,6 +5085,13 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2) @@ -4537,6 +5143,13 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrad: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> 
@llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2) @@ -4588,6 +5201,13 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psraw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2) @@ -4639,6 +5259,13 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2) @@ -4684,6 +5311,11 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4> ret <4 x i32> %1 } @@ -4730,6 +5362,13 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2) @@ -4781,6 +5420,13 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2) @@ -4830,6 +5476,12 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 
x i8> *%a2) { ; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = sub <16 x i8> %1, %2 @@ -4876,6 +5528,12 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = sub <4 x i32> %1, %2 @@ -4918,6 +5576,12 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = sub <2 x i64> %1, %2 @@ -4964,6 +5628,12 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2) @@ -5011,6 +5681,12 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) @@ -5058,6 +5734,12 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> 
*%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) @@ -5105,6 +5787,12 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) @@ -5152,6 +5840,12 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = sub <8 x i16> %1, %2 @@ -5198,6 +5892,12 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> @@ -5248,6 +5948,13 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> 
%a1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7> @@ -5297,6 +6004,13 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> <i32 1, i32 3> @@ -5344,6 +6058,12 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> @@ -5390,6 +6110,12 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> @@ -5440,6 +6166,13 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckldq: +; ZNVER1: # BB#0: +; 
ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> @@ -5489,6 +6222,13 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> <i32 0, i32 2> @@ -5536,6 +6276,12 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> @@ -5584,6 +6330,13 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pxor: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = xor <2 x i64> %1, %2 @@ -5633,6 +6386,13 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2> %2 = load 
<2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 2> @@ -5683,6 +6443,13 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2) @@ -5741,6 +6508,14 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) @@ -5785,6 +6560,12 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fsub <2 x double> %1, %2 @@ -5827,6 +6608,12 @@ define double @test_subsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fsub double %1, %2 @@ -5917,6 +6704,20 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 
@llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2) @@ -5967,6 +6768,13 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 3> @@ -6022,6 +6830,13 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2> @@ -6071,6 +6886,13 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll index ef1ddae4532d4..ad38d1c6ff490 100644 --- a/test/CodeGen/X86/sse3-schedule.ll +++ b/test/CodeGen/X86/sse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addsubpd: @@ -45,6 +45,12 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: 
vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2) @@ -88,6 +94,12 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2) @@ -131,6 +143,12 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2) @@ -174,6 +192,12 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2) @@ -217,6 +241,12 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2) @@ -260,6 +290,12 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubps 
(%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2) @@ -299,6 +335,11 @@ define <16 x i8> @test_lddqu(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lddqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vlddqu (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ret <16 x i8> %1 } @@ -347,6 +388,13 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movddup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer @@ -397,6 +445,13 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movshdup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [8:0.50] +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> @@ -447,6 +502,13 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsldup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [8:0.50] +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll index 1ab1598fcab7c..26cca98816a3d 100644 --- a/test/CodeGen/X86/sse41-schedule.ll +++ b/test/CodeGen/X86/sse41-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s 
--check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_blendpd: @@ -43,6 +43,13 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %a1, %1 @@ -80,6 +87,12 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 5, i32 2, i32 3> @@ -122,6 +135,12 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) %2 = load <2 x double>, <2 x double> *%a3, align 16 %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2) @@ -165,6 +184,12 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> 
@llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) %2 = load <4 x float>, <4 x float> *%a3 %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2) @@ -202,6 +227,12 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7) @@ -239,6 +270,12 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7) @@ -276,6 +313,12 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) %2 = load float, float *%a2 %3 = insertelement <4 x float> %1, float %2, i32 3 @@ -308,6 +351,11 @@ define <2 x i64> @test_movntdqa(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0) ret <2 x i64> %1 } @@ -343,6 +391,12 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00] ; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mpsadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = bitcast <8 x i16> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -381,6 +435,12 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: 
[1:0.50] ; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packusdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -425,6 +485,12 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 ; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendvb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; ZNVER1-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) %2 = load <16 x i8>, <16 x i8> *%a3, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2) @@ -462,6 +528,12 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15> @@ -498,6 +570,12 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <2 x i64> %a0, %a1 %2 = sext <2 x i1> %1 to <2 x i64> %3 = load <2 x i64>, <2 x i64>*%a2, align 16 @@ -536,6 +614,12 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) { ; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <16 x i8> %a0, i32 3 %2 = extractelement <16 x i8> %a0, i32 1 store i8 %2, i8 *%a1 @@ -573,6 +657,12 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) { ; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pextrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <4 x i32> %a0, i32 3 %2 = extractelement <4 x i32> %a0, i32 1 store i32 %2, i32 *%a1 @@ -609,6 +699,12 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) { ; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <2 x i64> %a0, i32 1 %2 = extractelement <2 x i64> %a0, i32 1 store i64 %2, i64 *%a2 @@ -645,6 +741,12 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) { ; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 3 %2 = extractelement <8 x i16> %a0, i32 1 store i16 %2, i16 *%a1 @@ -682,6 +784,12 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) { ; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phminposuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x i16>, <8 x i16> *%a0, align 16 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1) %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2) @@ -719,6 +827,12 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) { ; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1 %2 = load i8, i8 *%a2 %3 = insertelement <16 x i8> %1, i8 %2, i32 3 @@ -755,6 +869,12 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> %1, i32 %2, i32 3 @@ -796,6 +916,13 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) { ; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 
# sched: [8:0.50] +; ZNVER1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1 %2 = load i64, i64 *%a3 %3 = insertelement <2 x i64> %a1, i64 %2, i32 1 @@ -833,6 +960,12 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2) @@ -870,6 +1003,12 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2) @@ -907,6 +1046,12 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2) @@ -944,6 +1089,12 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2) @@ -981,6 +1132,12 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> 
*%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2) @@ -1018,6 +1175,12 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2) @@ -1055,6 +1218,12 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2) @@ -1092,6 +1261,12 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2) @@ -1135,6 +1310,13 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = sext <8 x i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1179,6 +1361,13 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) { ; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %2 = sext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1223,6 +1412,13 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> 
*%a1) { ; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1> %2 = sext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1267,6 +1463,13 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) { ; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> %2 = sext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1311,6 +1514,13 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) { ; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %2 = sext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1355,6 +1565,13 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) { ; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> %2 = sext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1399,6 +1616,13 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = zext <8 x 
i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1443,6 +1667,13 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %2 = zext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1487,6 +1718,13 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) { ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1> %2 = zext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1531,6 +1769,13 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) { ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> %2 = zext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1575,6 +1820,13 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) { ; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %2 = zext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1619,6 +1871,13 @@ define <2 x 
i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) { ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> %2 = zext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1657,6 +1916,12 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -1695,6 +1960,12 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = mul <4 x i32> %1, %2 @@ -1751,6 +2022,16 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ptest: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: setb %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %cl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2) @@ -1795,6 +2076,13 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7) @@ -1839,6 +2127,13 @@ define <4 x float> 
@test_roundps(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7) @@ -1884,6 +2179,13 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) %2 = load <2 x double>, <2 x double>* %a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7) @@ -1929,6 +2231,13 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7) diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll index 7ce9ffdbd0ea1..adf857e121797 100644 --- a/test/CodeGen/X86/sse42-schedule.ll +++ b/test/CodeGen/X86/sse42-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; GENERIC-LABEL: crc32_32_8: @@ -43,6 +43,13 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; 
ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2) @@ -85,6 +92,13 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) { ; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32w %si, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32w (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1) %2 = load i16, i16 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2) @@ -127,6 +141,13 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) { ; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32l (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1) %2 = load i32, i32 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2) @@ -169,6 +190,13 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind { ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2) @@ -211,6 +239,13 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) { ; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; ZNVER1-NEXT: crc32q (%rdx), %rdi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) %2 = load i64, i64 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2) @@ -283,6 +318,19 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> ; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpestri: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: movl %ecx, %esi # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; ZNVER1-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 
@llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7) @@ -341,6 +389,16 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17] ; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpestrm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) @@ -393,6 +451,15 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> ; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpistri: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def> +; ZNVER1-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7) @@ -431,6 +498,12 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpistrm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7) @@ -468,6 +541,12 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <2 x i64> %a0, %a1 %2 = sext <2 x i1> %1 to <2 x i64> %3 = load <2 x i64>, <2 x i64>*%a2, align 16 diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll index 11afdb7989f15..9ad6b0dfd4d61 100644 --- a/test/CodeGen/X86/sse4a-schedule.ll +++ b/test/CodeGen/X86/sse4a-schedule.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1 define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) { ; GENERIC-LABEL: test_extrq: @@ -11,8 +11,13 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) { ; ; BTVER2-LABEL: test_extrq: ; BTVER2: # BB#0: -; BTVER2-NEXT: extrq %xmm1, %xmm0 +; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1) ret <2 x i64> %1 } @@ -26,8 +31,13 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) { ; ; BTVER2-LABEL: test_extrqi: ; BTVER2: # BB#0: -; BTVER2-NEXT: extrq $2, $3, %xmm0 +; BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extrqi: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2) ret <2 x i64> %1 } @@ -41,8 +51,13 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) { ; ; BTVER2-LABEL: test_insertq: ; BTVER2: # BB#0: -; BTVER2-NEXT: insertq %xmm1, %xmm0 +; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1) ret <2 x i64> %1 } @@ -56,8 +71,13 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) { ; ; BTVER2-LABEL: test_insertqi: ; BTVER2: # BB#0: -; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 +; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertqi: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6) ret <2 x i64> %1 } @@ -73,6 +93,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) { ; BTVER2: # BB#0: ; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) ret void } @@ -88,6 +113,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) { ; BTVER2: # BB#0: ; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) ret void } diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll index f24969a30c337..24ace69ebb9e2 100644 --- 
a/test/CodeGen/X86/ssse3-schedule.ll +++ b/test/CodeGen/X86/ssse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) { ; GENERIC-LABEL: test_pabsb: @@ -52,6 +52,13 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) { ; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsb (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) %2 = load <16 x i8>, <16 x i8> *%a1, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2) @@ -103,6 +110,13 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2) @@ -147,6 +161,11 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pabsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2) @@ -196,6 +215,12 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50] ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_palignr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.25] +; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 
x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> @@ -238,6 +263,12 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2) @@ -289,6 +320,12 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -332,6 +369,12 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phaddw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2) @@ -375,6 +418,12 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2) @@ -426,6 +475,12 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -469,6 +524,12 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x 
i16> *%a2) { ; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phsubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2) @@ -512,6 +573,12 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaddubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = bitcast <8 x i16> %1 to <16 x i8> @@ -550,6 +617,11 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhrsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2) @@ -593,6 +665,12 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2) @@ -644,6 +722,12 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2) @@ -695,6 +779,12 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: 
[1:0.25] +; ZNVER1-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2) @@ -746,6 +836,12 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psignw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2) diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll index 29f8e3ed4f789..784b932addc85 100644 --- a/test/CodeGen/X86/statepoint-invoke.ll +++ b/test/CodeGen/X86/statepoint-invoke.ll @@ -95,8 +95,8 @@ left.relocs: right: ; CHECK-LABEL: %right - ; CHECK: movq ; CHECK: movq %rdx, (%rsp) + ; CHECK: movq ; CHECK: callq some_call %sp2 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3) to label %right.relocs unwind label %exceptional_return.right diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll index b16426eae3d5c..6e7fc7bf1c079 100644 --- a/test/CodeGen/X86/statepoint-stack-usage.ll +++ b/test/CodeGen/X86/statepoint-stack-usage.ll @@ -11,9 +11,9 @@ target triple = "x86_64-pc-linux-gnu" define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { ; CHECK-LABEL: back_to_back_calls ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movq %rdi, 16(%rsp) -; CHECK: movq %rdx, 8(%rsp) -; CHECK: movq %rsi, (%rsp) +; CHECK-DAG: movq %rdi, 16(%rsp) +; CHECK-DAG: movq %rdx, 8(%rsp) +; CHECK-DAG: movq %rsi, (%rsp) ; There should be no more than three moves ; CHECK-NOT: movq %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) @@ -36,9 +36,9 @@ define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 a define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" { ; CHECK-LABEL: reserve_first ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movq %rdi, 16(%rsp) -; CHECK: movq %rdx, 8(%rsp) -; CHECK: movq %rsi, (%rsp) +; CHECK-DAG: movq %rdi, 16(%rsp) +; CHECK-DAG: movq %rdx, 8(%rsp) +; CHECK-DAG: movq %rsi, (%rsp) %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12) %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13) @@ -61,21 +61,21 @@ define i32 @back_to_back_deopt(i32 %a, i32 %b, i32 %c) #1 gc "statepoint-example" { ; CHECK-LABEL: back_to_back_deopt ; The exact stores don't matter, but there need to be three stack slots created -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq -; CHECK: movl %ebx, 12(%rsp) -; CHECK: movl %ebp, 8(%rsp) -; CHECK: movl %r14d, 4(%rsp) +; CHECK-DAG: movl %ebx, 12(%rsp) +; CHECK-DAG: movl %ebp, 8(%rsp) +; CHECK-DAG: movl %r14d, 4(%rsp) ; CHECK: callq call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c) call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c) @@ -89,9 +89,9 @@ define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 ; CHECK-LABEL: back_to_back_invokes entry: ; The exact stores don't matter, but there need to be three stack slots created - ; CHECK: movq %rdi, 16(%rsp) - ; CHECK: movq %rdx, 8(%rsp) - ; CHECK: movq %rsi, (%rsp) + ; CHECK-DAG: movq %rdi, 16(%rsp) + ; CHECK-DAG: movq %rdx, 8(%rsp) + ; CHECK-DAG: movq %rsi, (%rsp) ; CHECK: callq %safepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) to label %normal_return unwind label %exceptional_return diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll index 5bc8f983ff06b..538d175649576 100644 --- a/test/CodeGen/X86/statepoint-vector.ll +++ b/test/CodeGen/X86/statepoint-vector.ll @@ -49,8 +49,8 @@ entry: ; CHECK: subq $40, %rsp ; CHECK: testb $1, %dil ; CHECK: movaps (%rsi), %xmm0 -; CHECK: movaps %xmm0, 16(%rsp) -; CHECK: movaps %xmm0, (%rsp) +; CHECK-DAG: movaps %xmm0, (%rsp) +; CHECK-DAG: movaps %xmm0, 16(%rsp) ; CHECK: callq do_safepoint ; CHECK: movaps (%rsp), %xmm0 ; CHECK: addq $40, %rsp diff --git a/test/CodeGen/X86/vec_cmp_uint-128.ll b/test/CodeGen/X86/vec_cmp_uint-128.ll index 8bed14e7e5f5f..cad7991c4f3b5 100644 --- a/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -463,7 +463,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: gt_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 @@ -476,7 +476,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX512-LABEL: gt_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 @@ -782,7 +782,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: lt_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 @@ -795,7 +795,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX512-LABEL: lt_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 2b5eb695f53ea..87cf2026d1ef4 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -135,7 +135,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 @@ -433,7 +433,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: 
vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 @@ -444,7 +444,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll index e7bfe3778212c..ce0ec6c3875ad 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -115,7 +115,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 @@ -381,7 +381,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 @@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2 ; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll index cd17fcf8c85b4..8138442b3eafd 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -130,7 +130,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -412,7 +412,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 @@ -423,7 +423,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll 
b/test/CodeGen/X86/vector-idiv-udiv-256.ll index 4adc2e2fb6c90..b0433110f1818 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -123,7 +123,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 @@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 @@ -403,7 +403,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll index 6719a66f030f8..c65c3e7fd004f 100644 --- a/test/CodeGen/X86/vector-idiv.ll +++ b/test/CodeGen/X86/vector-idiv.ll @@ -73,7 +73,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: PR20355: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index 852c1f4d3d981..04378ee2ee012 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -77,14 +77,19 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] -; AVX512-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v2i64: ; XOP: # BB#0: @@ -207,21 +212,26 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, 
%xmm2, %xmm2 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; AVX512-NEXT: vpsubd %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v4i32: ; XOP: # BB#0: @@ -844,28 +854,24 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; XOPAVX1-LABEL: constant_rotate_v2i64: -; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: constant_rotate_v2i64: -; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; AVX512VL-LABEL: constant_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; XOP-LABEL: constant_rotate_v2i64: +; XOP: # BB#0: +; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v2i64: ; X32-SSE: # BB#0: @@ -951,26 +957,24 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; -; XOPAVX1-LABEL: constant_rotate_v4i32: -; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq +; AVX512VL-LABEL: constant_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd 
{{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; -; XOPAVX2-LABEL: constant_rotate_v4i32: -; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: constant_rotate_v4i32: +; XOP: # BB#0: +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v4i32: ; X32-SSE: # BB#0: @@ -1100,11 +1104,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v8i16: @@ -1281,11 +1281,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v16i8: ; XOP: # BB#0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v16i8: @@ -1371,12 +1367,18 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlq $50, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v2i64: ; XOP: # BB#0: @@ -1412,12 +1414,18 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v4i32: ; XOP: # BB#0: @@ -1544,11 +1552,19 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolq $15, 
%zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v2i64: ; XOP: # BB#0: @@ -1595,14 +1611,19 @@ define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v4i32: ; XOP: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll index 14215e486bf9e..3b65b68352b5b 100644 --- a/test/CodeGen/X86/vector-rotate-256.ll +++ b/test/CodeGen/X86/vector-rotate-256.ll @@ -41,21 +41,25 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64] ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -128,21 +132,25 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubd %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; 
AVX512BW-LABEL: var_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -466,7 +474,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3 @@ -483,36 +491,36 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v4i64: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60> - %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 2> + %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4> %or = or <4 x i64> %shl, %lshr ret <4 x i64> %or } @@ -549,30 +557,33 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; 
AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21> @@ -643,30 +654,18 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v16i16: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlw %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, 
i16 14, i16 15> %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1> @@ -768,32 +767,20 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v32i8: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v32i8: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> @@ -825,12 +812,17 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq $50, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -873,12 +865,17 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld 
$28, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -1027,11 +1024,18 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX1: # BB#0: @@ -1082,14 +1086,18 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32: ; XOPAVX1: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll new file mode 100644 index 0000000000000..fa1b5c1c0cb4a --- /dev/null +++ b/test/CodeGen/X86/vector-rotate-512.ll @@ -0,0 +1,831 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VLBW + +; +; Variable Rotates +; + +define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: var_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b + %shl = shl <8 x 
i64> %a, %b + %lshr = lshr <8 x i64> %a, %b64 + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; AVX512-LABEL: var_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b + %shl = shl <16 x i32> %a, %b + %lshr = lshr <16 x i32> %a, %b32 + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512F-LABEL: var_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b + %shl = shl <32 x i16> %a, %b + %lshr = lshr <32 x i16> %a, %b16 + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512F-LABEL: var_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm3, 
%ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512F-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512VL-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand 
%ymm9, %ymm8, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512VL-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm9, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq 
{{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLBW-NEXT: retq + %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b + %shl = shl <64 x i8> %a, %b + %lshr = lshr <64 x i8> %a, %b8 + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Constant Rotates +; + +define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: constant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60> + %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4> + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: constant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> + %lshr = lshr <16 x i32> %a, <i32 28, i32 
27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21> + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1> + %or = or 
<32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = 
[8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = 
[57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> + %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> + %or = or <64 x i8> %shl, 
%lshr + ret <64 x i8> %or +} + +; +; Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14> + %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50> + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> + %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28> + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9> + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> + %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Masked Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15> + %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49> + %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255> + %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257> + %or = or <8 x i64> %lmask, %rmask + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: vpandd 
{{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> + %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28> + %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511> + %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3> + %or = or <16 x i32> %lmask, %rmask + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5> + %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11> + %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 
55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55> + %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33> + %or = or <32 x i16> %lmask, %rmask + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 
4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> + %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> + %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55> + %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33> + %or = or <64 x i8> %lmask, %rmask + ret <64 x i8> %or +} diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 09e143ddcd4d6..5f2b18fc9c03a 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -45,7 +45,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: var_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -66,7 +66,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: var_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -667,7 +667,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -687,7 +687,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; 
XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -1700,7 +1700,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-LABEL: splatconstant_shift_v4i64: ; XOPAVX2: # BB#0: ; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72057594037927936,72057594037927936,72057594037927936,72057594037927936] ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 820178d2d9927..5f00e55e225ba 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -745,7 +745,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; @@ -755,7 +755,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 30e5661d54859..4a7d25c1376e5 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -179,7 +179,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -189,7 +189,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; @@ -432,7 +432,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -442,7 +442,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 3bf677aadf195..2fce8a6019313 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ 
b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -89,7 +89,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -99,7 +99,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -235,7 +235,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -245,7 +245,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll index 5503cfc357e52..5825a56b6f99b 100644 --- a/test/CodeGen/X86/vselect-avx.ll +++ b/test/CodeGen/X86/vselect-avx.ll @@ -58,8 +58,8 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: movq (%rdi,%rsi,8), %rax -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.5,-0.5,-0.5,-0.5] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.5,0.5,0.5,0.5] ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovupd %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -108,7 +108,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; ; AVX2-LABEL: test3: ; AVX2: ## BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm3 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766] ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4 @@ -117,7 +117,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] ; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3] ; AVX2-NEXT: vpmulld %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/test/CodeGen/X86/widen_arith-2.ll b/test/CodeGen/X86/widen_arith-2.ll index 48753ad4fd762..5731b63f3bc14 100644 --- a/test/CodeGen/X86/widen_arith-2.ll +++ b/test/CodeGen/X86/widen_arith-2.ll @@ -16,20 +16,17 @@ define 
void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; CHECK-NEXT: .LBB0_2: # %forbody ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %ecx +; CHECK-NEXT: leal (,%eax,8), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; CHECK-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; CHECK-NEXT: psubw %xmm0, %xmm3 ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pshufb %xmm2, %xmm3 -; CHECK-NEXT: movq %xmm3, (%edx,%ecx,8) +; CHECK-NEXT: movq %xmm3, (%edx,%eax,8) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll index e55d62a461aa5..cc6fb27a62938 100644 --- a/test/CodeGen/X86/widen_cast-4.ll +++ b/test/CodeGen/X86/widen_cast-4.ll @@ -16,22 +16,19 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; NARROW-NEXT: .LBB0_2: # %forbody ; NARROW-NEXT: # in Loop: Header=BB0_1 Depth=1 ; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %ecx +; NARROW-NEXT: leal (,%eax,8), %ecx ; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx +; NARROW-NEXT: addl %ecx, %edx +; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx +; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; NARROW-NEXT: psubw %xmm0, %xmm2 ; NARROW-NEXT: psllw $8, %xmm2 ; NARROW-NEXT: psraw $8, %xmm2 ; NARROW-NEXT: psraw $2, %xmm2 ; NARROW-NEXT: pshufb %xmm1, %xmm2 -; NARROW-NEXT: movq %xmm2, (%edx,%ecx,8) +; NARROW-NEXT: movq %xmm2, (%edx,%eax,8) ; NARROW-NEXT: incl (%esp) ; NARROW-NEXT: .LBB0_1: # %forcond ; NARROW-NEXT: # =>This Inner Loop Header: Depth=1 @@ -54,24 +51,21 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; WIDE-NEXT: .LBB0_2: # %forbody ; WIDE-NEXT: # in Loop: Header=BB0_1 Depth=1 ; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %ecx +; WIDE-NEXT: leal (,%eax,8), %ecx ; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIDE-NEXT: addl %ecx, %edx +; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; WIDE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; WIDE-NEXT: pinsrd $1, 4(%eax,%ecx,8), %xmm3 +; WIDE-NEXT: pinsrd $1, 4(%ecx,%eax,8), %xmm3 ; WIDE-NEXT: psubb %xmm0, %xmm3 ; WIDE-NEXT: psrlw $2, %xmm3 ; WIDE-NEXT: pand %xmm1, %xmm3 ; WIDE-NEXT: pxor %xmm2, 
%xmm3 ; WIDE-NEXT: psubb %xmm2, %xmm3 -; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%ecx,8) -; WIDE-NEXT: movd %xmm3, (%edx,%ecx,8) +; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%eax,8) +; WIDE-NEXT: movd %xmm3, (%edx,%eax,8) ; WIDE-NEXT: incl (%esp) ; WIDE-NEXT: .LBB0_1: # %forcond ; WIDE-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/win64-nosse-csrs.ll b/test/CodeGen/X86/win64-nosse-csrs.ll index d1860b721044c..29d4f165392e3 100644 --- a/test/CodeGen/X86/win64-nosse-csrs.ll +++ b/test/CodeGen/X86/win64-nosse-csrs.ll @@ -20,7 +20,7 @@ entry-block: } ; Function Attrs: nounwind uwtable -define x86_64_win64cc i64 @peach() unnamed_addr #1 { +define win64cc i64 @peach() unnamed_addr #1 { entry-block: %0 = call i64 @banana() ret i64 %0 diff --git a/test/CodeGen/X86/win64_nonvol.ll b/test/CodeGen/X86/win64_nonvol.ll index 8e5f6cec1ab70..e1c615d75f282 100644 --- a/test/CodeGen/X86/win64_nonvol.ll +++ b/test/CodeGen/X86/win64_nonvol.ll @@ -5,7 +5,7 @@ ; Win64 nonvolatile registers get saved. ; CHECK-LABEL: bar: -define x86_64_win64cc void @bar(i32 %a, i32 %b) { +define win64cc void @bar(i32 %a, i32 %b) { ; CHECK-DAG: pushq %rdi ; CHECK-DAG: pushq %rsi ; CHECK-DAG: movaps %xmm6, diff --git a/test/CodeGen/X86/win64_params.ll b/test/CodeGen/X86/win64_params.ll index a0b552d4d5847..6b42735120137 100644 --- a/test/CodeGen/X86/win64_params.ll +++ b/test/CodeGen/X86/win64_params.ll @@ -12,7 +12,7 @@ entry: ret i32 %add } -define x86_64_win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { +define win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { entry: ; CHECK: movl 48(%rsp), %eax ; CHECK: addl 40(%rsp), %eax diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll index 0faa24ef7290d..c7550a467a352 100644 --- a/test/CodeGen/X86/win_chkstk.ll +++ b/test/CodeGen/X86/win_chkstk.ll @@ -51,7 +51,7 @@ entry: ; Make sure we don't call __chkstk or __alloca on non-Windows even if the ; caller has the Win64 calling convention. -define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X32: calll __chkstk ; WIN_X64: callq __chkstk diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll index c9a5fc2b32884..b4b8010ec564e 100644 --- a/test/CodeGen/X86/win_coreclr_chkstk.ll +++ b/test/CodeGen/X86/win_coreclr_chkstk.ll @@ -103,7 +103,7 @@ entry: ; Make sure we don't emit the probe sequence if not on windows even if the ; caller has the Win64 calling convention. -define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X64: movq %gs:16, %rcx ; LINUX-NOT: movq %gs:16, %rcx @@ -115,7 +115,7 @@ entry: declare i32 @bar(i8*) nounwind ; Within-body inline probe expansion -define x86_64_win64cc i32 @main4k_alloca(i64 %n) nounwind { +define win64cc i32 @main4k_alloca(i64 %n) nounwind { entry: ; WIN_X64: callq bar ; WIN_X64: movq %gs:16, [[R:%r.*]] diff --git a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll index 299190e8a595e..e3387a2709cba 100644 --- a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll +++ b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll @@ -3,7 +3,7 @@ ; Verify that the var arg parameters which are passed in registers are stored ; in home stack slots allocated by the caller and that AP is correctly ; calculated. -define x86_64_win64cc void @average_va(i32 %count, ...) 
nounwind { +define win64cc void @average_va(i32 %count, ...) nounwind { +entry: ; CHECK: pushq ; CHECK: movq %r9, 40(%rsp) @@ -24,7 +24,7 @@ declare void @llvm.va_end(i8*) nounwind ; CHECK-LABEL: f5: ; CHECK: pushq ; CHECK: leaq 56(%rsp), -define x86_64_win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { +define win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -35,7 +35,7 @@ entry: ; CHECK-LABEL: f4: ; CHECK: pushq ; CHECK: leaq 48(%rsp), -define x86_64_win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -46,7 +46,7 @@ entry: ; CHECK-LABEL: f3: ; CHECK: pushq ; CHECK: leaq 40(%rsp), -define x86_64_win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) nounwind { +define win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -62,7 +62,7 @@ entry: ; CHECK: movq [[REG_copy1]], 8(%rsp) ; CHECK: movq [[REG_copy1]], (%rsp) ; CHECK: ret -define x86_64_win64cc void @copy1(i64 %a0, ...) nounwind { +define win64cc void @copy1(i64 %a0, ...) nounwind { entry: %ap = alloca i8*, align 8 %cp = alloca i8*, align 8 @@ -78,7 +78,7 @@ entry: ; CHECK: movq [[REG_copy4]], 8(%rsp) ; CHECK: movq [[REG_copy4]], (%rsp) ; CHECK: ret -define x86_64_win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %cp = alloca i8*, align 8 @@ -96,7 +96,7 @@ entry: ; CHECK: movq [[REG_arg4_2]], (%rsp) ; CHECK: movl 48(%rsp), %eax ; CHECK: ret -define x86_64_win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* diff --git a/test/CodeGen/X86/x86-cmov-converter.ll b/test/CodeGen/X86/x86-cmov-converter.ll new file mode 100644 index 0000000000000..39877c14429ff --- /dev/null +++ b/test/CodeGen/X86/x86-cmov-converter.ll @@ -0,0 +1,321 @@ +; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; This test checks that the x86-cmov-converter optimization transforms CMOV +;; instructions into branches when it is profitable. +;; There are 5 cases below: +;; 1. CmovInCriticalPath: +;; CMOV depends on the condition and it is in the hot path. +;; Thus, it is worth transforming. +;; +;; 2. CmovNotInCriticalPath: +;; A similar test to (1), except that the CMOV is not in the hot path. +;; Thus, it is not worth transforming. +;; +;; 3. MaxIndex: +;; A maximum calculation algorithm that looks for the max index; +;; calculating the CMOV value is cheaper than calculating the CMOV condition. +;; Thus, it is worth transforming. +;; +;; 4. MaxValue: +;; A maximum calculation algorithm that looks for the max value; +;; calculating the CMOV value is not cheaper than calculating the CMOV condition. +;; Thus, it is not worth transforming. +;; +;; 5. BinarySearch: +;; Usually, a binary-search CMOV is not predicted. +;; Thus, it is not worth transforming. 
+;; +;; Test was created using the following command line: +;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - +;; Where foo.c is: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void CmovInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; } +;;} +;; +;; +;;void CmovNotInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; d[i] /= b; +;; } +;;} +;; +;; +;;int MaxIndex(int n, int *a) { +;; int t = 0; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > a[t]) +;; t = i; +;; } +;; return a[t]; +;;} +;; +;; +;;int MaxValue(int n, int *a) { +;; int t = a[0]; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > t) +;; t = a[i]; +;; } +;; return t; +;;} +;; +;;typedef struct Node Node; +;;struct Node { +;; unsigned Val; +;; Node *Right; +;; Node *Left; +;;}; +;; +;;unsigned BinarySearch(unsigned Mask, Node *Curr, Node *Next) { +;; while (Curr->Val > Next->Val) { +;; Curr = Next; +;; if (Mask & (0x1 << Curr->Val)) +;; Next = Curr->Right; +;; else +;; Next = Curr->Left; +;; } +;; return Curr->Val; +;;} +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%struct.Node = type { i32, %struct.Node*, %struct.Node* } + +; CHECK-LABEL: CmovInHotPath +; CHECK-NOT: cmov +; CHECK: jg + +define void @CmovInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 0 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. = select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: CmovNotInHotPath +; CHECK: cmovg + +define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture %d) #0 { +entry: + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. 
= select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv + %1 = load i32, i32* %arrayidx7, align 4 + %div = sdiv i32 %1, %b + store i32 %div, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: MaxIndex +; CHECK-NOT: cmov +; CHECK: jg + +define i32 @MaxIndex(i32 %n, i32* nocapture readonly %a) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = sext i32 %i.0.t.0 to i64 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %t.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %t.0.lcssa + %0 = load i32, i32* %arrayidx5, align 4 + ret i32 %0 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.015 = phi i32 [ %i.0.t.0, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %idxprom1 = sext i32 %t.015 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1 + %2 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp sgt i32 %1, %2 + %3 = trunc i64 %indvars.iv to i32 + %i.0.t.0 = select i1 %cmp3, i32 %3, i32 %t.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; CHECK-LABEL: MaxValue +; CHECK-NOT: jg +; CHECK: cmovg + +define i32 @MaxValue(i32 %n, i32* nocapture readonly %a) #0 { +entry: + %0 = load i32, i32* %a, align 4 + %cmp13 = icmp sgt i32 %n, 1 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %t.0.lcssa = phi i32 [ %0, %entry ], [ %.t.0, %for.body ] + ret i32 %t.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.014 = phi i32 [ %.t.0, %for.body ], [ %0, %for.body.preheader ] + %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx1, align 4 + %cmp2 = icmp sgt i32 %1, %t.014 + %.t.0 = select i1 %cmp2, i32 %1, i32 %t.014 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: BinarySearch +; CHECK: cmov + +define i32 @BinarySearch(i32 %Mask, %struct.Node* nocapture readonly %Curr, %struct.Node* nocapture readonly %Next) #0 { +entry: + %Val8 = getelementptr inbounds %struct.Node, %struct.Node* %Curr, i64 0, i32 0 + %0 = load i32, i32* %Val8, align 8 + %Val19 = getelementptr inbounds %struct.Node, %struct.Node* %Next, i64 0, i32 0 + %1 = load i32, i32* %Val19, align 8 + %cmp10 = icmp ugt i32 %0, %1 + br i1 %cmp10, label %while.body, label %while.end + +while.body: ; preds = %entry, %while.body + %2 = phi i32 [ %4, %while.body ], [ 
%1, %entry ] + %Next.addr.011 = phi %struct.Node* [ %3, %while.body ], [ %Next, %entry ] + %shl = shl i32 1, %2 + %and = and i32 %shl, %Mask + %tobool = icmp eq i32 %and, 0 + %Left = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 2 + %Right = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 1 + %Left.sink = select i1 %tobool, %struct.Node** %Left, %struct.Node** %Right + %3 = load %struct.Node*, %struct.Node** %Left.sink, align 8 + %Val1 = getelementptr inbounds %struct.Node, %struct.Node* %3, i64 0, i32 0 + %4 = load i32, i32* %Val1, align 8 + %cmp = icmp ugt i32 %2, %4 + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + %.lcssa = phi i32 [ %0, %entry ], [ %2, %while.body ] + ret i32 %.lcssa +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The following test checks that the x86-cmov-converter optimization transforms +;; CMOV instructions into branches correctly. +;; +;; MBB: +;; cond = cmp ... +;; v1 = CMOVgt t1, f1, cond +;; v2 = CMOVle s1, f2, cond +;; +;; Where: t1 = 11, f1 = 22, f2 = a +;; +;; After CMOV transformation +;; ------------------------- +;; MBB: +;; cond = cmp ... +;; ja %SinkMBB +;; +;; FalseMBB: +;; jmp %SinkMBB +;; +;; SinkMBB: +;; %v1 = phi[%f1, %FalseMBB], [%t1, %MBB] +;; %v2 = phi[%f1, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch +;; ; true-value with false-value +;; ; Phi instruction cannot use +;; ; previous Phi instruction result +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-LABEL: Transform +; CHECK-NOT: cmov +; CHECK: divl [[a:%[0-9a-z]*]] +; CHECK: cmpl [[a]], %eax +; CHECK: movl $11, [[s1:%[0-9a-z]*]] +; CHECK: movl [[a]], [[s2:%[0-9a-z]*]] +; CHECK: ja [[SinkBB:.*]] +; CHECK: [[FalseBB:.*]]: +; CHECK: movl $22, [[s1]] +; CHECK: movl $22, [[s2]] +; CHECK: [[SinkBB]]: +; CHECK: ja + +define void @Transform(i32 *%arr, i32 *%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 { +entry: + %cmp10 = icmp ugt i32 0, %n + br i1 %cmp10, label %while.body, label %while.end + +while.body: ; preds = %entry, %while.body + %i = phi i32 [ %i_inc, %while.body ], [ 0, %entry ] + %arr_i = getelementptr inbounds i32, i32* %arr, i32 %i + %x = load i32, i32* %arr_i, align 4 + %div = udiv i32 %x, %a + %cond = icmp ugt i32 %div, %a + %condOpp = icmp ule i32 %div, %a + %s1 = select i1 %cond, i32 11, i32 22 + %s2 = select i1 %condOpp, i32 %s1, i32 %a + %sum = urem i32 %s1, %s2 + store i32 %sum, i32* %arr_i, align 4 + %i_inc = add i32 %i, 1 + %cmp = icmp ugt i32 %i_inc, %n + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +attributes #0 = {"target-cpu"="x86-64"}
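
As an aid to reading the transformation sketch above, the following is an IR-level analogue of the branch-plus-phi shape it describes for the two selects in @Transform. It is only an illustrative sketch under stated assumptions: the cmov-converter pass rewrites MachineIR rather than LLVM IR, and the function and block names below are invented for this example, not taken from the pass output.

define i32 @transform_branch_sketch(i32 %div, i32 %a) {
; One compare feeds both former CMOVs; the true edge goes straight to the
; sink block, mirroring "ja %SinkMBB", and the false edge passes through an
; otherwise empty block, mirroring "FalseMBB: jmp %SinkMBB".
entry:
  %cond = icmp ugt i32 %div, %a
  br i1 %cond, label %sink, label %false

false:
  br label %sink

sink:
  ; %s1 stands in for "select i1 %cond, i32 11, i32 22".
  %s1 = phi i32 [ 11, %entry ], [ 22, %false ]
  ; %s2 stands in for "select i1 %condOpp, i32 %s1, i32 %a"; because %condOpp
  ; is the opposite condition, its operands are swapped, and the phi reads %a
  ; and 22 directly rather than the result of the %s1 phi.
  %s2 = phi i32 [ %a, %entry ], [ 22, %false ]
  %sum = urem i32 %s1, %s2
  ret i32 %sum
}

Feeding @Transform itself through the llc invocation from the test's RUN line shows the corresponding machine-level form that the CHECK lines above assert.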