Diffstat (limited to 'test/CodeGen/X86/sse2-schedule.ll'):
 -rw-r--r--  test/CodeGen/X86/sse2-schedule.ll | 824
 1 file changed, 823 insertions(+), 1 deletion(-)
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index 3c36b21381390..6ee908e0c7871 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -7,7 +7,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_addpd:
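For context, lit expands each RUN line above by substituting %s with the test file's path, so the new znver1 line executes roughly the following pipeline (a sketch, assuming llc and FileCheck are on PATH and the LLVM source tree is the working directory):

  llc < test/CodeGen/X86/sse2-schedule.ll -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 \
    | FileCheck test/CodeGen/X86/sse2-schedule.ll --check-prefix=CHECK --check-prefix=ZNVER1

FileCheck then matches the ZNVER1-LABEL/ZNVER1-NEXT assertions added in the hunks below against llc's annotated assembly, where each "sched: [N:M.MM]" comment gives the scheduling model's latency N and reciprocal throughput M.MM for that instruction on the selected CPU.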
@@ -45,6 +45,12 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fadd <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fadd <2 x double> %1, %2
@@ -87,6 +93,12 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fadd double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fadd double %1, %2
@@ -135,6 +147,13 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = and <4 x i32> %1, %2
@@ -188,6 +207,13 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andnotpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -243,6 +269,13 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmppd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; ZNVER1-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fcmp oeq <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fcmp oeq <2 x double> %a0, %2
@@ -288,6 +321,12 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <2 x double> undef, double %a0, i32 0
%2 = insertelement <2 x double> undef, double %a1, i32 0
%3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0)
@@ -383,6 +422,20 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50]
; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_comisd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
+; ZNVER1-NEXT: vcomisd (%rdi), %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %dl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 8
%3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2)
@@ -433,6 +486,13 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtdq2pd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%2 = sitofp <2 x i32> %1 to <2 x double>
%3 = load <4 x i32>, <4 x i32>*%a1, align 16
@@ -485,6 +545,13 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtdq2ps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sitofp <4 x i32> %a0 to <4 x float>
%2 = load <4 x i32>, <4 x i32>*%a1, align 16
%3 = sitofp <4 x i32> %2 to <4 x float>
@@ -535,6 +602,13 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpd2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2)
@@ -586,6 +660,13 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpd2ps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2)
@@ -637,6 +718,13 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtps2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2)
@@ -688,6 +776,13 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtps2pd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%2 = fpext <2 x float> %1 to <2 x double>
%3 = load <4 x float>, <4 x float> *%a1, align 16
@@ -739,6 +834,13 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
; BTVER2-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsd2si:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsd2si (%rdi), %eax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <2 x double> undef, double %a0, i32 0
%2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1)
%3 = load double, double *%a1, align 8
@@ -791,6 +893,13 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsd2siq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsd2si (%rdi), %rax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <2 x double> undef, double %a0, i32 0
%2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1)
%3 = load double, double *%a1, align 8
@@ -850,6 +959,14 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsd2ss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fptrunc double %a0 to float
%2 = load double, double *%a1, align 8
%3 = fptrunc double %2 to float
@@ -899,6 +1016,13 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsi2sd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sitofp i32 %a0 to double
%2 = load i32, i32 *%a1, align 8
%3 = sitofp i32 %2 to double
@@ -948,6 +1072,13 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsi2sdq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sitofp i64 %a0 to double
%2 = load i64, i64 *%a1, align 8
%3 = sitofp i64 %2 to double
@@ -1006,6 +1137,14 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtss2sd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fpext float %a0 to double
%2 = load float, float *%a1, align 4
%3 = fpext float %2 to double
@@ -1056,6 +1195,13 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttpd2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fptosi <2 x double> %a0 to <2 x i32>
%2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = load <2 x double>, <2 x double> *%a1, align 16
@@ -1108,6 +1254,13 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttps2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fptosi <4 x float> %a0 to <4 x i32>
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = fptosi <4 x float> %2 to <4 x i32>
@@ -1157,6 +1310,13 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttsd2si:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttsd2si (%rdi), %eax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fptosi double %a0 to i32
%2 = load double, double *%a1, align 8
%3 = fptosi double %2 to i32
@@ -1206,6 +1366,13 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttsd2siq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttsd2si (%rdi), %rax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fptosi double %a0 to i64
%2 = load double, double *%a1, align 8
%3 = fptosi double %2 to i64
@@ -1249,6 +1416,12 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
+; ZNVER1-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fdiv <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fdiv <2 x double> %1, %2
@@ -1291,6 +1464,12 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
+; ZNVER1-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fdiv double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fdiv double %1, %2
@@ -1333,6 +1512,11 @@ define void @test_lfence() {
; BTVER2: # BB#0:
; BTVER2-NEXT: lfence # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lfence:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: lfence # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
call void @llvm.x86.sse2.lfence()
ret void
}
@@ -1374,6 +1558,11 @@ define void @test_mfence() {
; BTVER2: # BB#0:
; BTVER2-NEXT: mfence # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mfence:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: mfence # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
call void @llvm.x86.sse2.mfence()
ret void
}
@@ -1413,6 +1602,11 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maskmovdqu:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2)
ret void
}
@@ -1454,6 +1648,12 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2)
@@ -1497,6 +1697,12 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2)
@@ -1540,6 +1746,12 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2)
@@ -1583,6 +1795,12 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2)
@@ -1632,6 +1850,13 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movapd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovapd (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <2 x double>, <2 x double> *%a0, align 16
%2 = fadd <2 x double> %1, %1
store <2 x double> %2, <2 x double> *%a1, align 16
@@ -1680,6 +1905,13 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movdqa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovdqa (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <2 x i64>, <2 x i64> *%a0, align 16
%2 = add <2 x i64> %1, %1
store <2 x i64> %2, <2 x i64> *%a1, align 16
@@ -1728,6 +1960,13 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movdqu:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovdqu (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <2 x i64>, <2 x i64> *%a0, align 1
%2 = add <2 x i64> %1, %1
store <2 x i64> %2, <2 x i64> *%a1, align 1
@@ -1794,6 +2033,16 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovd %xmm1, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <4 x i32> undef, i32 %a1, i32 0
%2 = load i32, i32 *%a2
%3 = insertelement <4 x i32> undef, i32 %2, i32 0
@@ -1865,6 +2114,16 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movd_64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovq %xmm1, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <2 x i64> undef, i64 %a1, i64 0
%2 = load i64, i64 *%a2
%3 = insertelement <2 x i64> undef, i64 %2, i64 0
@@ -1918,6 +2177,13 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movhpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast x86_mmx* %a2 to double*
%2 = load double, double *%1, align 8
%3 = insertelement <2 x double> %a1, double %2, i32 1
@@ -1969,6 +2235,13 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movlpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast x86_mmx* %a2 to double*
%2 = load double, double *%1, align 8
%3 = insertelement <2 x double> %a1, double %2, i32 0
@@ -2010,6 +2283,11 @@ define i32 @test_movmskpd(<2 x double> %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movmskpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
ret i32 %1
}
@@ -2053,6 +2331,12 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntdqa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <2 x i64> %a0, %a0
store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0
ret void
@@ -2094,6 +2378,12 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fadd <2 x double> %a0, %a0
store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0
ret void
@@ -2141,6 +2431,13 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movq_mem:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovq %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64* %a1, align 1
%2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0
%3 = add <2 x i64> %a0, %2
@@ -2187,6 +2484,12 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movq_reg:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
%2 = add <2 x i64> %a1, %1
ret <2 x i64> %2
@@ -2234,6 +2537,13 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movsd_mem:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load double, double* %a0, align 1
%2 = fadd double %1, %1
store double %2, double *%a1, align 1
@@ -2277,6 +2587,11 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movsd_reg:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 0>
ret <2 x double> %1
}
@@ -2323,6 +2638,13 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movupd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovupd (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <2 x double>, <2 x double> *%a0, align 1
%2 = fadd <2 x double> %1, %1
store <2 x double> %2, <2 x double> *%a1, align 1
@@ -2365,6 +2687,12 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fmul <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fmul <2 x double> %1, %2
@@ -2407,6 +2735,12 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fmul double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fmul double %1, %2
@@ -2455,6 +2789,13 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_orpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = or <4 x i32> %1, %2
@@ -2510,6 +2851,12 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packssdw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <8 x i16> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -2562,6 +2909,12 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packsswb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = bitcast <16 x i8> %1 to <8 x i16>
%3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -2614,6 +2967,12 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packuswb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = bitcast <16 x i8> %1 to <8 x i16>
%3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -2662,6 +3021,12 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = add <16 x i8> %1, %2
@@ -2708,6 +3073,12 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = add <4 x i32> %1, %2
@@ -2750,6 +3121,12 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = add <2 x i64> %1, %2
@@ -2796,6 +3173,12 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddsb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2)
@@ -2843,6 +3226,12 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2)
@@ -2890,6 +3279,12 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddusb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2)
@@ -2937,6 +3332,12 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddusw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2)
@@ -2984,6 +3385,12 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = add <8 x i16> %1, %2
@@ -3032,6 +3439,13 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pand:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = and <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = and <2 x i64> %1, %2
@@ -3087,6 +3501,13 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pandn:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = xor <2 x i64> %a0, <i64 -1, i64 -1>
%2 = and <2 x i64> %a1, %1
%3 = load <2 x i64>, <2 x i64> *%a2, align 16
@@ -3136,6 +3557,12 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pavgb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2)
@@ -3183,6 +3610,12 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pavgw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2)
@@ -3234,6 +3667,13 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp eq <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = icmp eq <16 x i8> %a0, %2
@@ -3286,6 +3726,13 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp eq <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = icmp eq <4 x i32> %a0, %2
@@ -3338,6 +3785,13 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp eq <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = icmp eq <8 x i16> %a0, %2
@@ -3391,6 +3845,13 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp sgt <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = icmp sgt <16 x i8> %a0, %2
@@ -3444,6 +3905,13 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp sgt <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = icmp eq <4 x i32> %a0, %2
@@ -3497,6 +3965,13 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp sgt <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = icmp sgt <8 x i16> %a0, %2
@@ -3541,6 +4016,12 @@ define i16 @test_pextrw(<8 x i16> %a0) {
; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pextrw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = extractelement <8 x i16> %a0, i32 6
ret i16 %1
}
@@ -3585,6 +4066,12 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pinsrw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <8 x i16> %a0, i16 %a1, i32 1
%2 = load i16, i16 *%a2
%3 = insertelement <8 x i16> %1, i16 %2, i32 3
@@ -3635,6 +4122,12 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaddwd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
%2 = bitcast <4 x i32> %1 to <8 x i16>
%3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -3683,6 +4176,12 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %1, <8 x i16> %2)
@@ -3730,6 +4229,12 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxub:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2)
@@ -3777,6 +4282,12 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2)
@@ -3824,6 +4335,12 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminub:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %1, <16 x i8> %2)
@@ -3863,6 +4380,11 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovmskb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
ret i32 %1
}
@@ -3904,6 +4426,12 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulhuw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2)
@@ -3947,6 +4475,12 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulhw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2)
@@ -3990,6 +4524,12 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmullw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = mul <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = mul <8 x i16> %1, %2
@@ -4040,6 +4580,12 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmuludq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <2 x i64> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -4090,6 +4636,13 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_por:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = or <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = or <2 x i64> %1, %2
@@ -4141,6 +4694,12 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psadbw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
%2 = bitcast <2 x i64> %1 to <16 x i8>
%3 = load <16 x i8>, <16 x i8> *%a2, align 16
@@ -4193,6 +4752,13 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshufd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50]
+; ZNVER1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
%3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -4244,6 +4810,13 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshufhw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [8:0.50]
+; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6>
%2 = load <8 x i16>, <8 x i16> *%a1, align 16
%3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4>
@@ -4295,6 +4868,13 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshuflw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [8:0.50]
+; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
%2 = load <8 x i16>, <8 x i16> *%a1, align 16
%3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -4344,6 +4924,13 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pslld:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2)
@@ -4389,6 +4976,11 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pslldq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i32> %1
}
@@ -4435,6 +5027,13 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psllq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2)
@@ -4486,6 +5085,13 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psllw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2)
@@ -4537,6 +5143,13 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrad:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2)
@@ -4588,6 +5201,13 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psraw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2)
@@ -4639,6 +5259,13 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrld:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2)
@@ -4684,6 +5311,11 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrldq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %1
}
@@ -4730,6 +5362,13 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrlq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2)
@@ -4781,6 +5420,13 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrlw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2)
@@ -4830,6 +5476,12 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = sub <16 x i8> %1, %2
@@ -4876,6 +5528,12 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = sub <4 x i32> %1, %2
@@ -4918,6 +5576,12 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = sub <2 x i64> %1, %2
@@ -4964,6 +5628,12 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubsb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2)
@@ -5011,6 +5681,12 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2)
@@ -5058,6 +5734,12 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubusb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2)
@@ -5105,6 +5787,12 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubusw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2)
@@ -5152,6 +5840,12 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = sub <8 x i16> %1, %2
@@ -5198,6 +5892,12 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckhbw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -5248,6 +5948,13 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckhdq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [8:0.50]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -5297,6 +6004,13 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckhqdq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
@@ -5344,6 +6058,12 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckhwd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -5390,6 +6110,12 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpcklbw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -5440,6 +6166,13 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckldq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [8:0.50]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -5489,6 +6222,13 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpcklqdq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
@@ -5536,6 +6276,12 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpcklwd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -5584,6 +6330,13 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pxor:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = xor <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = xor <2 x i64> %1, %2
@@ -5633,6 +6386,13 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_shufpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50]
+; ZNVER1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 2>
@@ -5683,6 +6443,13 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sqrtpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00]
+; ZNVER1-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [20:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2)
@@ -5741,6 +6508,14 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sqrtsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovapd (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [27:1.00]
+; ZNVER1-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [27:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
@@ -5785,6 +6560,12 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_subpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fsub <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fsub <2 x double> %1, %2
@@ -5827,6 +6608,12 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_subsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fsub double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fsub double %1, %2
@@ -5917,6 +6704,20 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50]
; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ucomisd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
+; ZNVER1-NEXT: vucomisd (%rdi), %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %dl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 8
%3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2)
@@ -5967,6 +6768,13 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpckhpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
+; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 3>
@@ -6022,6 +6830,13 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpcklpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2>
@@ -6071,6 +6886,13 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xorpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, %2