author | Dimitry Andric <dim@FreeBSD.org> | 2018-01-06 21:34:26 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2018-01-06 21:34:26 +0000
commit | d215fd3b74b90f5dc1964610926fcc2a20f959aa (patch)
tree | 0c9f21e40eae033d6760008729f37d2103e2c654 /test/CodeGen/X86
parent | b8a2042aa938069e862750553db0e4d82d25822c (diff)
Diffstat (limited to 'test/CodeGen/X86')
51 files changed, 2036 insertions, 2165 deletions
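Most of the assembly churn below is regenerated FileCheck assertions for splat/broadcast and mask-conversion codegen. For reference, a minimal sketch (a hypothetical reduced test, not one of the files in this commit) of the insertelement-plus-shufflevector splat IR that tests such as test_mm256_set1_epi16 exercise:

; Sketch of the splat pattern the updated CHECK lines cover
; (assumed reduced example; function name is illustrative only).
define <16 x i16> @splat_i16(i16 %a) nounwind {
  ; insert the scalar into lane 0 of an undef vector ...
  %v = insertelement <16 x i16> undef, i16 %a, i32 0
  ; ... then broadcast lane 0 to all lanes with a zero shuffle mask
  %s = shufflevector <16 x i16> %v, <16 x i16> undef, <16 x i32> zeroinitializer
  ret <16 x i16> %s
}

Running llc on IR of this shape yields the vpshuflw/vpshufd/vinsertf128 sequences whose shuffle immediates change in the first hunk below.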
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 297922809ea7e..9069755ad1319 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -2364,16 +2364,16 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
 ; X32: # %bb.0:
 ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: vmovd %eax, %xmm0
-; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_set1_epi16:
 ; X64: # %bb.0:
 ; X64-NEXT: vmovd %edi, %xmm0
-; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X64-NEXT: retq
 %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index da547397c6ce8..9cd05a353fbf5 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -15,8 +15,8 @@ entry:
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: funcB:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT: retq
 entry:
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 5a9f23007d86f..e2dc40c6f0ef1 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -874,39 +874,33 @@ define float @broadcast_lifetime() nounwind {
 ; X32-LABEL: broadcast_lifetime:
 ; X32: ## %bb.0:
 ; X32-NEXT: pushl %esi
-; X32-NEXT: subl $56, %esp
+; X32-NEXT: subl $40, %esp
 ; X32-NEXT: leal {{[0-9]+}}(%esp), %esi
 ; X32-NEXT: movl %esi, (%esp)
 ; X32-NEXT: calll _gfunc
 ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill
+; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## 4-byte Spill
 ; X32-NEXT: movl %esi, (%esp)
 ; X32-NEXT: calll _gfunc
 ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: vpermilps $0, {{[0-9]+}}(%esp), %xmm1 ## 16-byte Folded Reload
-; X32-NEXT: ## xmm1 = mem[0,0,0,0]
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; X32-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 4-byte Folded Reload
 ; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: addl $56, %esp
+; X32-NEXT: addl $40, %esp
 ; X32-NEXT: popl %esi
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: broadcast_lifetime:
 ; X64: ## %bb.0:
 ; X64-NEXT: subq $40, %rsp
-; X64-NEXT: movq %rsp, %rdi
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT: callq _gfunc
 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill
-; X64-NEXT: movq %rsp, %rdi
+; X64-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
 ; X64-NEXT: callq _gfunc
 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: vpermilps $0, {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload
-; X64-NEXT: ## xmm1 = mem[0,0,0,0]
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; X64-NEXT: vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
 ; X64-NEXT: addq $40, %rsp
 ; X64-NEXT: retq
 %1 = alloca <4 x float>, align 16
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 821c65bef06a8..e3cf2181387fb 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -410,3 +410,27 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
 %res1 = select i1 %cond, i32 %res, i32 0
 ret i32 %res1
 }
+
+define <1 x i1> @test13(<1 x i1>* %foo) {
+; KNL-LABEL: test13:
+; KNL: ## %bb.0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test13:
+; SKX: ## %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test13:
+; KNL_X32: ## %bb.0:
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT: movzbl (%eax), %eax
+; KNL_X32-NEXT: ## kill: def %al killed %al killed %eax
+; KNL_X32-NEXT: retl
+ %bar = load <1 x i1>, <1 x i1>* %foo
+ ret <1 x i1> %bar
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 18e9f306bc1bf..a95e22a048563 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -2018,3 +2018,329 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
 %1 = uitofp <2 x i1> %mask to <2 x double>
 ret <2 x double> %1
 }
+
+define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f64toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttsd2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NOVLDQ-NEXT: vcvttsd2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOVLDQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f64toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2udq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f64toub:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptoui <2 x double> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f64toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVL-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f64toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <4 x double> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f64toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f64toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT: vpslld $31, %ymm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <8 x double> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f32toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttss2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NOVLDQ-NEXT: vcvttss2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOVLDQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f32toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f32toub:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptoui <2 x float> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f32toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVL-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f32toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <4 x float> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f32toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f32toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; VL-NEXT: vpslld $31, %ymm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <8 x float> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
+; ALL-LABEL: test_16f32toub:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vpslld $31, %zmm0, %zmm0
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; ALL-NEXT: retq
+ %mask = fptoui <16 x float> %a to <16 x i1>
+ %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
+ ret <16 x i32> %select
+}
+
+define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f64tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttsd2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NOVLDQ-NEXT: vcvttsd2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f64tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f64tosb:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptosi <2 x double> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f64tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f64tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <4 x double> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f64tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f64tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <8 x double> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f32tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttss2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NOVLDQ-NEXT: vcvttss2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f32tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f32tosb:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptosi <2 x float> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f32tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f32tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <4 x float> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f32tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f32tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <8 x float> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
+; ALL-LABEL: test_16f32tosb:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; ALL-NEXT: retq
+ %mask = fptosi <16 x float> %a to <16 x i1>
+ %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
+ ret <16 x i32> %select
+}
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index 8c79415912179..a966235df2160 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -345,9 +345,8 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL-NEXT: retq
 ;
@@ -369,9 +368,8 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
 ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovsxbd (%rdi), %ymm1
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovsxbd (%rdi), %ymm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL-NEXT: retq
 ;
@@ -702,9 +700,8 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
 ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL-NEXT: retq
 ;
@@ -726,9 +723,8 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
 ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovsxwd (%rdi), %ymm1
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovsxwd (%rdi), %ymm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL-NEXT: retq
 ;
@@ -760,9 +756,8 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
 ; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
 ; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index e1ed8ea98a1c3..b49e2ceca0bff 100644
--- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -6,7 +6,7 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: kshiftrb $4, %k0, %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -22,7 +22,6 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -37,7 +36,7 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $6, %k0, %k0
+; AVX512-NEXT: kshiftrb $6, %k0, %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -53,7 +52,6 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -83,7 +81,6 @@ define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x do
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -113,7 +110,6 @@ define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x flo
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -143,7 +139,6 @@ define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x d
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -173,7 +168,6 @@ define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x fl
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -203,7 +197,6 @@ define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -233,7 +226,6 @@ define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -264,7 +256,6 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -295,7 +286,6 @@ define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -325,7 +315,6 @@ define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -358,7 +347,6 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -389,7 +377,6 @@ define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -419,7 +406,6 @@ define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -450,7 +436,6 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -481,7 +466,6 @@ define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
@@ -512,7 +496,6 @@ define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -542,7 +525,6 @@ define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -575,7 +557,6 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -608,7 +589,6 @@ define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
@@ -624,7 +604,7 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k0
+; AVX512-NEXT: kshiftrb $1, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -645,7 +625,7 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k0
+; AVX512-NEXT: kshiftrb $1, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -666,7 +646,7 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k0
+; AVX512-NEXT: kshiftrb $2, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -687,7 +667,7 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k0
+; AVX512-NEXT: kshiftrb $2, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -708,7 +688,7 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $3, %k0, %k0
+; AVX512-NEXT: kshiftrb $3, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -729,7 +709,7 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: kshiftrb $4, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -750,7 +730,7 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: kshiftrb $4, %k0, %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -765,7 +745,6 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -779,7 +758,7 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
@@ -800,7 +779,7 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $6, %k0, %k0
+; AVX512-NEXT: kshiftrb $6, %k0, %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -815,7 +794,6 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -863,7 +841,6 @@ define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -891,7 +868,6 @@ define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -939,7 +915,6 @@ define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -967,7 +942,6 @@ define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1015,7 +989,6 @@ define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1043,7 +1016,6 @@ define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1072,7 +1044,6 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1121,7 +1092,6 @@ define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1149,7 +1119,6 @@ define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1180,7 +1149,6 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1229,7 +1197,6 @@ define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1257,7 +1224,6 @@ define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1286,7 +1252,6 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1315,7 +1280,6 @@ define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1)
 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
 ; AVX512NOTDQ-NEXT: vzeroupper
@@ -1363,7 +1327,6 @@ define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1391,7 +1354,6 @@ define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1422,7 +1384,6 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1453,7 +1414,6 @@ define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1)
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
 ; AVX512NOTDQ-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 7477e05f0c7f9..9e11c799e1798 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -805,7 +805,6 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 0601c011e2903..333efb04913d3 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -3,26 +3,28 @@
 ; FIXME: All cases here should be fixed by PR34380
-define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
-; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
 ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
@@ -35,13 +37,14 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i
 ret <8 x i16> %res
 }
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,6,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
@@ -55,14 +58,14 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x
 }
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -74,14 +77,14 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 6bee0de181abc..f6cb093d521b3 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -136,7 +136,7 @@ define <4 x i1> @test9(<8 x i1> %a, <8 x i1> %b) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
 ; CHECK-NEXT: vpmovw2m %xmm0, %k0
-; CHECK-NEXT: kshiftrw $4, %k0, %k0
+; CHECK-NEXT: kshiftrb $4, %k0, %k0
 ; CHECK-NEXT: vpmovm2d %k0, %xmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <8 x i1> %a, <8 x i1> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -148,7 +148,7 @@ define <2 x i1> @test10(<4 x i1> %a, <4 x i1> %b) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
 ; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: kshiftrw $2, %k0, %k0
+; CHECK-NEXT: kshiftrb $2, %k0, %k0
 ; CHECK-NEXT: vpmovm2q %k0, %xmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <4 x i1> %a, <4 x i1> %b, <2 x i32> <i32 2, i32 3>
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 826a4538f3f1c..6f0f873c2f70f 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -195,14 +195,12 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; KNL-NEXT: vpmovdb %zmm1, %xmm1
 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT: vpmovdb %zmm0, %xmm0
 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, (%rsp)
 ; KNL-NEXT: movl (%rsp), %eax
@@ -235,28 +233,24 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
 ; KNL-NEXT: vpmovdb %zmm1, %xmm1
 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
 ; KNL-NEXT: vpmovdb %zmm0, %xmm0
 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, (%rsp)
 ; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
 ; KNL-NEXT: vpmovdb %zmm0, %xmm0
 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
 ; KNL-NEXT: vpmovdb %zmm0, %xmm0
 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: movl (%rsp), %ecx
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index fdd6f7126457d..8c13d4b842fcb 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -871,23 +871,14 @@ define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
 define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
 ; X32-LABEL: test_mm_mask_broadcastd_epi32:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastd_epi32:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i4
@@ -903,23 +894,14 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64>
 define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_maskz_broadcastd_epi32:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastd_epi32:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT: retq
 %trn0 = trunc i8 %a0 to i4
@@ -1007,23 +989,14 @@ define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
 define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
 ; X32-LABEL: test_mm_mask_broadcastq_epi64:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastq_epi64:
 ; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i2
@@ -1036,23 +1009,14 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64>
 define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_maskz_broadcastq_epi64:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastq_epi64:
 ; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT: retq
 %trn0 = trunc i8 %a0 to i2
@@ -1079,23 +1043,14 @@ define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
 define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
 ; X32-LABEL: test_mm256_mask_broadcastq_epi64:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_mask_broadcastq_epi64:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i4
@@ -1108,23 +1063,14 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i
 define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
 ; X64-NEXT: retq
 %trn0 = trunc i8 %a0 to i4
@@ -1151,23 +1097,14 @@ define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
 define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
 ; X32-LABEL: test_mm_mask_broadcastsd_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastsd_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i2
@@ -1180,23 +1117,14 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x
 define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
 ; X32-LABEL: test_mm_maskz_broadcastsd_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastsd_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
 ; X64-NEXT: retq
 %trn0 = trunc i8 %a0 to i2
@@ -1223,23 +1151,14 @@ define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
 define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
 ; X32-LABEL: test_mm256_mask_broadcastsd_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_mask_broadcastsd_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i4
@@ -1252,23 +1171,14 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2
 define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
 ; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
 ; X64-NEXT: retq
 %trn0 = trunc i8 %a0 to i4
@@ -1295,23 +1205,14 @@ define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
 define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
 ; X32-LABEL: test_mm_mask_broadcastss_ps:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastss_ps:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i4
@@ -1324,23 +1225,14 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl
 define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
 ; X32-LABEL: test_mm_maskz_broadcastss_ps:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastss_ps:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT: retq
 %trn0 = trunc i8 %a0 to i4
@@ -1419,23 +1311,14 @@ define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
 define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
 ; X32-LABEL: test_mm_mask_movddup_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_mask_movddup_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i2
@@ -1448,23 +1331,14 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub
 define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
 ; X32-LABEL: test_mm_maskz_movddup_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_maskz_movddup_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a0 to i2
@@ -1491,23 +1365,14 @@ define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
 define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
 ; X32-LABEL: test_mm256_mask_movddup_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_mask_movddup_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a1 to i4
@@ -1520,23 +1385,14 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d
 define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
 ; X32-LABEL: test_mm256_maskz_movddup_pd:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
 ; X32-NEXT: kmovw %eax, %k1
 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
-; X32-NEXT: popl %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm256_maskz_movddup_pd:
 ; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
 ; X64-NEXT: retq
 %trn1 = trunc i8 %a0 to i4
@@ -1563,23 +1419,14 @@ define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
 define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
 ; X32-LABEL: test_mm_mask_movehdup_ps:
 ; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT:
movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_movehdup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1592,23 +1439,14 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm_maskz_movehdup_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_movehdup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1687,23 +1525,14 @@ define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) { define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) { ; X32-LABEL: test_mm_mask_moveldup_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_moveldup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1716,23 +1545,14 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm_maskz_moveldup_ps: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_moveldup_ps: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] ; X64-NEXT: retq %trn0 = trunc i8 %a0 to i4 @@ -1811,23 +1631,14 @@ define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) { define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) { ; X32-LABEL: test_mm256_mask_permutex_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; 
X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_permutex_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1840,23 +1651,14 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64 define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_permutex_epi64: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_permutex_epi64: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i4 @@ -1883,23 +1685,14 @@ define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) { define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) { ; X32-LABEL: test_mm256_mask_permutex_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_permutex_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -1912,23 +1705,14 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) { ; X32-LABEL: test_mm256_maskz_permutex_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_permutex_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i4 @@ -1955,23 +1739,14 @@ define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) { define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) { ; X32-LABEL: test_mm_mask_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, 
{{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i2 @@ -1984,23 +1759,14 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) { ; X32-LABEL: test_mm_maskz_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $3, %al -; X32-NEXT: movb %al, {{[0-9]+}}(%esp) -; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $3, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i2 @@ -2027,23 +1793,14 @@ define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) { define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) { ; X32-LABEL: test_mm256_mask_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2] ; X64-NEXT: retq %trn1 = trunc i8 %a1 to i4 @@ -2056,23 +1813,14 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) { ; X32-LABEL: test_mm256_maskz_shuffle_pd: ; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al -; X32-NEXT: andb $15, %al -; X32-NEXT: movb %al, (%esp) -; X32-NEXT: movzbl (%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2] -; X32-NEXT: popl %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_shuffle_pd: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2] ; X64-NEXT: retq %trn1 = trunc i8 %a0 to i4 @@ -2099,23 +1847,14 @@ define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) { define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) { ; 
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -2128,23 +1867,14 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float>
define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 5f4b050b863d1..ea1ff4e56b959 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -22,7 +22,6 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -59,7 +58,6 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -96,7 +94,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -104,7 +101,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -142,7 +138,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -150,7 +145,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -193,7 +187,6 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -235,7 +228,6 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -275,7 +267,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -285,7 +276,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -326,7 +316,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -336,7 +325,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -380,11 +368,9 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -427,11 +413,9 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -587,7 +571,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -614,7 +597,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -643,7 +625,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -674,7 +655,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -712,7 +692,6 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -776,7 +755,6 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -842,7 +820,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -910,7 +887,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -979,7 +955,6 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -1048,7 +1023,6 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -1119,7 +1093,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -1192,7 +1165,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -1271,7 +1243,6 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1311,7 +1282,6 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1349,7 +1319,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1357,7 +1326,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1396,7 +1364,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1404,7 +1371,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1450,7 +1416,6 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1495,7 +1460,6 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1536,7 +1500,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1546,7 +1509,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1588,7 +1550,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1598,7 +1559,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1814,13 +1774,11 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -1949,14 +1907,12 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -3117,11 +3073,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -3158,11 +3112,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -3214,7 +3166,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3272,7 +3223,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3319,11 +3269,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -3377,7 +3325,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3427,11 +3374,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -3474,11 +3419,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -3536,7 +3479,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3600,7 +3542,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3653,11 +3594,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -3717,7 +3656,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -4802,7 +4740,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4842,7 +4779,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4885,7 +4821,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4929,7 +4864,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4973,7 +4907,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -5017,7 +4950,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -5064,7 +4996,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5109,7 +5040,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5157,7 +5087,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5206,7 +5135,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5255,7 +5183,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5304,7 +5231,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5333,8 +5259,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
@@ -5361,8 +5286,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
@@ -5391,8 +5315,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
@@ -5430,8 +5353,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
@@ -5470,8 +5392,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b)
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
@@ -5502,8 +5423,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
@@ -6090,11 +6010,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -6131,11 +6049,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -6181,7 +6097,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6233,7 +6148,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6280,11 +6194,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -6332,7 +6244,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6382,11 +6293,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -6429,11 +6338,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -6485,7 +6392,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6543,7 +6449,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6596,11 +6501,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -6654,7 +6557,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7476,7 +7378,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7519,7 +7420,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7577,7 +7477,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7637,7 +7536,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7686,7 +7584,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7746,7 +7643,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7798,7 +7694,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7847,7 +7742,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7911,7 +7805,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7977,7 +7870,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -8032,7 +7924,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -8098,7 +7989,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -9132,7 +9022,6 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -9169,7 +9058,6 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64>
%__a, <2 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -9206,7 +9094,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9214,7 +9101,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -9252,7 +9138,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9260,7 +9145,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -9303,7 +9187,6 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9345,7 +9228,6 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9385,7 +9267,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9395,7 +9276,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9436,7 +9316,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: 
kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -9446,7 +9325,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -9490,11 +9368,9 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -9537,11 +9413,9 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -9697,7 +9571,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -9724,7 +9597,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax @@ -9753,7 +9625,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -9784,7 +9655,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -9822,7 +9692,6 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -9886,7 +9755,6 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 ; 
NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -9952,7 +9820,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10020,7 +9887,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10089,7 +9955,6 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -10158,7 +10023,6 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -10229,7 +10093,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10302,7 +10165,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftrw $7, %k0, %k1 @@ -10381,7 +10243,6 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10421,7 +10282,6 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10459,7 +10319,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10467,7 +10326,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10506,7 +10364,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10514,7 +10371,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -10560,7 +10416,6 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10605,7 +10460,6 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10646,7 +10500,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10656,7 +10509,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10698,7 +10550,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 @@ -10708,7 +10559,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -10924,13 +10774,11 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: 
vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -11059,14 +10907,12 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %ecx @@ -12227,11 +12073,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -12268,11 +12112,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -12324,7 +12166,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12382,7 +12223,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12429,11 +12269,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -12487,7 +12325,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12537,11 +12374,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -12584,11 +12419,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -12646,7 +12479,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12710,7 +12542,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ 
-12763,11 +12594,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -12827,7 +12656,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -13912,7 +13740,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -13952,7 +13779,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -13995,7 +13821,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14039,7 +13864,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14083,7 +13907,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14127,7 +13950,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -14174,7 +13996,6 @@ 
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14219,7 +14040,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14267,7 +14087,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14316,7 +14135,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14365,7 +14183,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14414,7 +14231,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -14443,8 +14259,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: @@ -14471,8 +14286,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: @@ -14501,8 +14315,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: @@ -14540,8 +14353,7 @@ define 
zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem: @@ -14580,8 +14392,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: @@ -14612,8 +14423,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, < ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: kmovb %k0, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: @@ -15200,11 +15010,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -15241,11 +15049,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -15291,7 +15097,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15343,7 +15148,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, 
%zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15390,11 +15194,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -15442,7 +15244,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15492,11 +15293,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -15539,11 +15338,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -15595,7 +15392,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15653,7 +15449,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15706,11 +15501,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -15764,7 +15557,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16586,7 +16378,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16629,7 +16420,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16687,7 +16477,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16747,7 +16536,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16796,7 +16584,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16856,7 +16643,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16908,7 +16694,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -16957,7 +16742,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17021,7 +16805,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17087,7 +16870,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; 
NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17142,7 +16924,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -17208,7 +16989,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -18331,7 +18111,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -18380,7 +18159,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -18522,7 +18300,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -18576,7 +18353,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -19644,7 +19420,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -19694,7 +19469,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -19843,7 +19617,6 @@ define zeroext i64 
@test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -19898,7 +19671,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -21451,7 +21223,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21495,7 +21266,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21551,7 +21321,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21610,7 +21379,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21659,7 +21427,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21717,7 +21484,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: 
vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21769,7 +21535,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21819,7 +21584,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21881,7 +21645,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -21946,7 +21709,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -22001,7 +21763,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -22065,7 +21826,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: 
vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -23150,7 +22910,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23190,7 +22949,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23233,7 +22991,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23277,7 +23034,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23321,7 +23077,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23365,7 +23120,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax @@ -23412,7 +23166,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23457,7 +23210,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -23505,7 +23257,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: 
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23554,7 +23305,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23603,7 +23353,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23652,7 +23401,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23681,8 +23429,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
@@ -23711,8 +23458,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
@@ -23744,8 +23490,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
@@ -23783,8 +23528,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
@@ -23824,8 +23568,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
@@ -23858,8 +23601,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
@@ -24464,7 +24206,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24508,7 +24249,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24558,7 +24298,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24611,7 +24350,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24660,7 +24398,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24712,7 +24449,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24764,7 +24500,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24814,7 +24549,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24870,7 +24604,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24929,7 +24662,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24984,7 +24716,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -25042,7 +24773,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -25894,7 +25624,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -25940,7 +25669,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26000,7 +25728,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26063,7 +25790,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26114,7 +25840,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26176,7 +25901,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26230,7 +25954,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26282,7 +26005,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26348,7 +26070,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26417,7 +26138,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26474,7 +26194,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26542,7 +26261,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -27579,7 +27297,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -27619,7 +27336,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -27659,7 +27375,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27667,7 +27382,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -27708,7 +27422,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27716,7 +27429,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -27762,7 +27474,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27807,7 +27518,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27850,7 +27560,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27860,7 +27569,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27904,7 +27612,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27914,7 +27621,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27961,11 +27667,9 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -28011,11 +27715,9 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -28180,7 +27882,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -28210,7 +27911,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -28242,7 +27942,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -28276,7 +27975,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -28317,7 +28015,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28384,7 +28081,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28453,7 +28149,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28524,7 +28219,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28596,7 +28290,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28668,7 +28361,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28742,7 +28434,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28818,7 +28509,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28900,7 +28590,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -28943,7 +28632,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -28984,7 +28672,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -28992,7 +28679,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -29034,7 +28720,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -29042,7 +28727,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -29091,7 +28775,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29139,7 +28822,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29183,7 +28865,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -29193,7 +28874,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29238,7 +28918,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -29248,7 +28927,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29466,7 +29144,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
@@ -29475,7 +29152,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -29607,7 +29283,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -29616,7 +29291,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -30826,11 +30500,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -30870,11 +30542,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -30929,7 +30599,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -30990,7 +30659,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31040,11 +30708,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -31101,7 +30767,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31154,11 +30819,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -31204,11 +30867,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -31269,7 +30930,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31336,7 +30996,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31392,11 +31051,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -31459,7 +31116,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -32544,7 +32200,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32584,7 +32239,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32627,7 +32281,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32671,7 +32324,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32715,7 +32367,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32759,7 +32410,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32806,7 +32456,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32851,7 +32500,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32899,7 +32547,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32948,7 +32595,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32997,7 +32643,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -33046,7 +32691,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -33075,8 +32719,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
@@ -33106,8 +32749,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
@@ -33139,8 +32781,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
@@ -33181,8 +32822,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
@@ -33224,8 +32864,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
@@ -33259,8 +32898,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
@@ -33889,11 +33527,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -33933,11 +33569,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -33986,7 +33620,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34041,7 +33674,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34091,11 +33723,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -34146,7 +33776,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34199,11 +33828,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -34249,11 +33876,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -34308,7 +33933,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34369,7 +33993,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34425,11 +34048,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -34486,7 +34107,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35347,7 +34967,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35393,7 +35012,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35454,7 +35072,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35517,7 +35134,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35569,7 +35185,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35632,7 +35247,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35687,7 +35301,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35739,7 +35352,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35806,7 +35418,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35875,7 +35486,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -35933,7 +35543,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -36002,7 +35611,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37177,8 +36785,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -37186,9 +36793,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i6 ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 @@ -37238,8 +36843,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al killed %eax @@ -37247,9 +36851,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 ; ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry -; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -37300,8 +36902,7 @@ entry: define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1} ; VLX-NEXT: kmovd %k0, %eax ; VLX-NEXT: # kill: def %al killed %al 
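A note on the recurring change in the masked hunks above and below: the zero-extended i4/i2 mask argument now moves straight from %edi into a mask register (kmovd %edi, %k1 under VLX, kmovw %edi, %k1 otherwise) instead of being spilled with movb and reloaded with kmovb or movzbl. Below is a minimal sketch of the kind of IR these tests exercise; the function name, RUN line, and feature string are illustrative assumptions, not taken from this diff.

; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s
define zeroext i8 @sketch_masked_cmp_v4i1_v8i1(i4 zeroext %u, <4 x float> %a, <4 x float> %b) {
entry:
  ; reinterpret the zero-extended GPR argument as a 4-bit predicate
  %m = bitcast i4 %u to <4 x i1>
  ; the compare produces a v4i1
  %c = fcmp oeq <4 x float> %a, %b
  ; masking is a plain and of the two predicates
  %r = and <4 x i1> %c, %m
  ; widen v4i1 to v8i1 with zeros so it can be returned as an i8 mask
  %e = shufflevector <4 x i1> %r, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ret = bitcast <8 x i1> %e to i8
  ret i8 %ret
}

Because the argument arrives already zero-extended in %edi, a direct kmov preserves exactly the low mask bits, and the store/reload round-trip through the stack bought nothing.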
@@ -37309,9 +36910,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -37523,8 +37122,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -37532,9 +37130,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
@@ -37584,8 +37180,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -37593,9 +37188,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -37646,8 +37239,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -37655,9 +37247,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u,
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -37729,11 +37319,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -37770,11 +37358,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -37813,11 +37399,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -37839,8 +37423,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -37853,24 +37436,21 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -37889,8 +37469,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -37903,24 +37482,21 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -37940,8 +37516,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -37954,10 +37529,8 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -37965,14 +37538,13 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -38015,11 +37587,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -38062,11 +37632,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -38111,11 +37679,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -38140,8 +37706,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -38154,10 +37719,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
@@ -38167,16 +37730,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -38196,8 +37758,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -38210,10 +37771,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -38223,16 +37782,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -38253,8 +37811,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -38267,10 +37824,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -38281,16 +37836,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -39366,7 +38920,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39406,7 +38959,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39447,7 +38999,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39491,7 +39042,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39535,7 +39085,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39580,7 +39129,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
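The deleted vpslld $31 lines are the other simplification repeated through this file: the value fed to vptestmd comes from vpmovsxbd of a compare result, so every 32-bit lane is already all-zeros or all-ones. vptestmd sets a mask bit for each nonzero lane, and that answer is the same whether or not bit 0 is first shifted into the sign position. A before/after fragment in the same check style (registers as in the hunks above, purely illustrative):

; before: move the boolean into the sign bit, then test
;   vpmovsxbd %xmm1, %zmm1
;   vpslld $31, %zmm1, %zmm1
;   vptestmd %zmm1, %zmm1, %k0
; after: each lane is 0 or -1 already, so test it directly
;   vpmovsxbd %xmm1, %zmm1
;   vptestmd %zmm1, %zmm1, %k0

The vpslld $31 that still appears on the %zmm0 path in some masked variants is unchanged context around the hunks, not a new insertion.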
@@ -39669,7 +39217,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39714,7 +39261,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39760,7 +39306,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39809,7 +39354,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39858,7 +39402,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39908,7 +39451,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39991,8 +39533,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
@@ -40019,8 +39560,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
@@ -40048,8 +39588,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
@@ -40078,18 +39617,14 @@ entry:
define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -40115,18 +39650,14 @@ entry:
define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -40153,18 +39684,14 @@ entry:
define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40310,8 +39837,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -40319,9 +39845,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i6
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -40357,8 +39881,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -40366,9 +39889,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -40405,8 +39926,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -40414,9 +39934,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40572,8 +40090,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -40581,9 +40098,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -40619,8 +40134,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -40628,9 +40142,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -40667,8 +40179,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -40676,9 +40187,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u,
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40736,11 +40245,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -40777,11 +40284,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -40820,11 +40325,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -40846,8 +40349,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -40860,24 +40362,21 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -40896,8 +40395,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -40910,24 +40408,21 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -40947,8 +40442,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -40961,10 +40455,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40972,14 +40464,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -41022,11 +40513,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -41069,11 +40558,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -41118,11 +40605,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -41147,8 +40632,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -41161,10 +40645,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -41174,16 +40656,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -41203,8 +40684,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -41217,10 +40697,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
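The result side mirrors the mask side: kmovb %k0, %eax, which zeroes the upper bits of the destination GPR, replaces a kmovb store plus movzbl reload, and with the byte spill slot gone the over-aligned NoVLX frames shrink as well (subq $96 becomes subq $64, and subq $64 becomes subq $32 in the hunks above). A hedged sketch, with the stack offset invented for illustration:

; before: bounce the mask through the stack
;   kmovb %k0, -2(%rsp)
;   movzbl -2(%rsp), %eax
; after: move it straight to the GPR
;   kmovb %k0, %eax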
@@ -41230,16 +40708,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -41260,8 +40737,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -41274,10 +40750,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -41288,16 +40762,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -41487,8 +40960,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -41497,9 +40969,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i6
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41550,8 +41020,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -41560,9 +41029,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41614,8 +41081,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -41624,9 +41090,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
@@ -41848,8 +41312,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -41858,9 +41321,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41911,8 +41372,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -41921,9 +41381,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41975,8 +41433,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -41985,9 +41442,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u,
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
@@ -42062,7 +41517,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42105,7 +41559,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42150,7 +41603,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42176,8 +41628,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
@@ -42191,10 +41642,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -42202,14 +41651,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -42228,8 +41676,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
@@ -42243,10 +41690,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -42254,14 +41699,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -42281,8 +41725,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
@@ -42296,10 +41739,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
@@ -42308,14 +41749,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -42360,7 +41800,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42409,7 +41848,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42460,7 +41898,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42489,8 +41926,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
@@ -42504,10 +41940,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT:
movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -42518,16 +41952,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -42547,8 +41980,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; VLX: # %bb.0: # %entry -; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: vzeroupper @@ -42562,10 +41994,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 @@ -42576,16 +42006,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -42606,8 +42035,7 @@ entry: define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr { ; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; VLX: # %bb.0: # %entry -; 
VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1 +; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1} ; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: vzeroupper @@ -42621,10 +42049,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $96, %rsp -; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2 ; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0 @@ -42636,16 +42062,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp @@ -43830,7 +43255,6 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) { ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll index 78c44e4dca3b3..45af265a95b05 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -31,11 +31,9 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax @@ -80,8 +78,7 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i32: @@ -89,8 +86,7 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i32> %a, %b %x1 = icmp sgt <4 x i32> %c, %d @@ -123,8 +119,7 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) ; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; AVX512F-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4f32: @@ -132,8 +127,7 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) ; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; AVX512BW-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = fcmp ogt <4 x float> %a, %b %x1 = fcmp ogt <4 x float> %c, %d @@ -165,11 +159,9 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax @@ -318,8 +310,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i8: @@ -335,8 +326,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i8> %a, %b %x1 = icmp sgt <2 x i8> %c, %d @@ -473,8 +463,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i16: @@ -490,8 +479,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i16> %a, %b %x1 = icmp sgt <2 x i16> %c, %d @@ -612,8 +600,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; 
AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: @@ -629,8 +616,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i32> %a, %b %x1 = icmp sgt <2 x i32> %c, %d @@ -682,8 +668,7 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: @@ -691,8 +676,7 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <2 x i64> %a, %b %x1 = icmp sgt <2 x i64> %c, %d @@ -725,8 +709,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k1 ; AVX512F-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2f64: @@ -734,8 +717,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k1 ; AVX512BW-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = fcmp ogt <2 x double> %a, %b %x1 = fcmp ogt <2 x double> %c, %d @@ -792,8 +774,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i8: @@ -809,8 +790,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i8> %a, %b %x1 = icmp sgt <4 x i8> %c, %d @@ -867,8 +847,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i16: @@ -884,8 +863,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i16> %a, %b %x1 = icmp sgt <4 x i16> %c, %d @@ -944,10 +922,8 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-NEXT: vpmovsxwd %xmm2, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll index fdce65516e322..62480bb0bd25c 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -94,8 +94,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512F-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -104,8 +103,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512BW-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x0 = icmp sgt <4 x i64> %a, %b @@ -148,8 +146,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> ; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512F-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -158,8 +155,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> ; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512BW-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x0 = fcmp ogt <4 x double> %a, %b @@ -219,11 +215,9 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; 
AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index 6ef2be99dee52..04cbded7667cd 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -44,10 +44,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; ; AVX512-LABEL: ext_i2_2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: andb $3, %dil -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -86,10 +83,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; ; AVX512-LABEL: ext_i4_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: andb $15, %dil -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -102,8 +96,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3-LABEL: ext_i8_8i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -112,8 +106,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-LABEL: ext_i8_8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -239,10 +233,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; ; AVX512-LABEL: ext_i4_4i64: ; AVX512: # %bb.0: -; AVX512-NEXT: andb $15, %dil -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: retq @@ -305,8 +296,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3-LABEL: ext_i16_16i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 @@ -319,8 +310,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1-LABEL: ext_i16_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -565,8 +556,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-LABEL: ext_i32_32i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 @@ -574,8 +565,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768] ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 @@ -586,8 +577,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-LABEL: ext_i32_32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 @@ -599,8 +590,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 9e77cd11449e6..54ba1881f1151 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -49,9 +49,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX512F-LABEL: ext_i2_2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: andb $3, %dil -; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 ; AVX512F-NEXT: vzeroupper @@ -59,10 +57,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; ; AVX512VLBW-LABEL: ext_i2_2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: andb $3, %dil -; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: kmovd %edi, %k1 ; AVX512VLBW-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} ; AVX512VLBW-NEXT: retq %1 = bitcast i2 %a0 to <2 x i1> @@ -104,9 +99,7 @@ define <4 
x i32> @ext_i4_4i32(i4 %a0) { ; AVX512F-LABEL: ext_i4_4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: andb $15, %dil -; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 ; AVX512F-NEXT: vzeroupper @@ -114,10 +107,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; ; AVX512VLBW-LABEL: ext_i4_4i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: andb $15, %dil -; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: kmovd %edi, %k1 ; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; AVX512VLBW-NEXT: retq %1 = bitcast i4 %a0 to <4 x i1> @@ -129,8 +119,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3-LABEL: ext_i8_8i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -140,8 +130,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-LABEL: ext_i8_8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -300,19 +290,14 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; AVX512F-LABEL: ext_i4_4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: andb $15, %dil -; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLBW-LABEL: ext_i4_4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: andb $15, %dil -; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: kmovd %edi, %k1 ; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k1} {z} ; AVX512VLBW-NEXT: retq %1 = bitcast i4 %a0 to <4 x i1> @@ -385,8 +370,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3-LABEL: ext_i16_16i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 @@ -401,8 +386,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1-LABEL: ext_i16_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw 
{{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -723,8 +708,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-LABEL: ext_i32_32i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 @@ -734,8 +719,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: psrlw $15, %xmm1 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 @@ -748,8 +733,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-LABEL: ext_i32_32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 @@ -763,8 +748,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm2 diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index 45a48fae146d4..8af95dfd5b80e 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -43,9 +43,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) { ; ; AVX512-LABEL: bitcast_i2_2i1: ; AVX512: # %bb.0: -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -86,9 +84,7 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) { ; ; AVX512-LABEL: bitcast_i4_4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: kmovd %edi, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: retq @@ -100,8 +96,8 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; SSE2-SSSE3-LABEL: bitcast_i8_8i1: ; 
SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -111,8 +107,8 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; AVX1-LABEL: bitcast_i8_8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll index 8fdacb7b79d61..a96c1a30e67a5 100644 --- a/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-setcc-128.ll @@ -27,7 +27,6 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax @@ -64,16 +63,14 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i32> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -99,16 +96,14 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = fcmp ogt <4 x float> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -134,7 +129,6 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax @@ -226,8 +220,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al 
killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i8: @@ -238,8 +231,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i8> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -320,8 +312,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i16: @@ -332,8 +323,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i16> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -406,8 +396,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i32: @@ -418,8 +407,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i32> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -455,16 +443,14 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <2 x i64> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -490,16 +476,14 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2f64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = fcmp ogt <2 x double> %a, %b %res = bitcast <2 x i1> %x to i2 @@ -537,8 +521,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; AVX512F-NEXT: 
vpsrad $24, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i8: @@ -549,8 +532,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i8> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -588,8 +570,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v4i16: @@ -600,8 +581,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i16> %a, %b %res = bitcast <4 x i1> %x to i4 @@ -641,7 +621,6 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll index 48e28c9d26ca4..0398f31f12d48 100644 --- a/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-setcc-256.ll @@ -42,7 +42,6 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax @@ -194,11 +193,9 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rsp) ; AVX512F-NEXT: movl (%rsp), %eax @@ -271,8 +268,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -280,8 +276,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x = icmp sgt <4 x i64> %a, %b @@ -311,8 +306,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -320,8 +314,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX512BW-NEXT: # kill: def %al killed %al killed %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %x = fcmp ogt <4 x double> %a, %b diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll index 9914f0b934341..f752068acdf00 100644 --- a/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/test/CodeGen/X86/bitcast-setcc-512.ll @@ -62,14 +62,12 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rsp) ; AVX512F-NEXT: movl (%rsp), %eax @@ -870,21 +868,17 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rsp) ; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: movl (%rsp), %ecx diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll index 8548d8b7677d2..428eaa19497b6 100644 --- a/test/CodeGen/X86/broadcastm-lowering.ll +++ b/test/CodeGen/X86/broadcastm-lowering.ll @@ -8,7 +8,6 @@ define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) { ; AVX512CD: # %bb.0: # %entry ; AVX512CD-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512CD-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512CD-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512CD-NEXT: kmovw %k0, %eax ; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -45,7 +44,6 @@ define <4 x i32> @test_mm_epi32(<16 x i8> %a, <16 x i8> %b) { ; AVX512CD: # %bb.0: # %entry ; AVX512CD-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0 ; 
AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512CD-NEXT: kmovw %k0, %eax
; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -179,7 +177,6 @@ define <8 x i32> @test_mm256_epi32(<16 x i16> %a, <16 x i16> %b) {
; AVX512CD: # %bb.0: # %entry
; AVX512CD-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512CD-NEXT: kmovw %k0, %eax
; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/build-vector-128.ll b/test/CodeGen/X86/build-vector-128.ll
index da92fe6c3fdae..6c0c2d30c312d 100644
--- a/test/CodeGen/X86/build-vector-128.ll
+++ b/test/CodeGen/X86/build-vector-128.ll
@@ -409,3 +409,101 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
ret <16 x i8> %ins15
}
+
+; PR30780
+
+define <4 x i32> @test_buildvector_v4i32_splat_sext_i8(i8 %in) {
+; SSE-32-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; SSE-32: # %bb.0:
+; SSE-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; SSE-32-NEXT: movd %eax, %xmm0
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-32-NEXT: retl
+;
+; SSE-64-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; SSE-64: # %bb.0:
+; SSE-64-NEXT: movsbl %dil, %eax
+; SSE-64-NEXT: movd %eax, %xmm0
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-64-NEXT: retq
+;
+; AVX1-32-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movsbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movsbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-64-NEXT: retq
+ %ext = sext i8 %in to i32
+ %insert = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %splat
+}
+
+define <4 x i32> @test_buildvector_v4i32_splat_zext_i8(i8 %in) {
+; SSE-32-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; SSE-32: # %bb.0:
+; SSE-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; SSE-32-NEXT: movd %eax, %xmm0
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-32-NEXT: retl
+;
+; SSE-64-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; SSE-64: # %bb.0:
+; SSE-64-NEXT: movzbl %dil, %eax
+; SSE-64-NEXT: movd %eax, %xmm0
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-64-NEXT: retq
+;
+; AVX1-32-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movzbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movzbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-64-NEXT: retq
+ %ext = zext i8 %in to i32
+ %insert = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %splat
+}
diff --git a/test/CodeGen/X86/build-vector-256.ll b/test/CodeGen/X86/build-vector-256.ll
index f2f17710033d7..d2d7a194c7012 100644
--- a/test/CodeGen/X86/build-vector-256.ll
+++ b/test/CodeGen/X86/build-vector-256.ll
@@ -411,3 +411,77 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%ins31 = insertelement <32 x i8> %ins30, i8 %a31, i32 31
ret <32 x i8> %ins31
}
+
+; PR30780
+
+define <8 x i32> @test_buildvector_v8i32_splat_sext_i8(i8 %in) {
+; AVX1-32-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movsbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movsbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-64-NEXT: retq
+ %ext = sext i8 %in to i32
+ %insert = insertelement <8 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %splat
+}
+
+define <8 x i32> @test_buildvector_v8i32_splat_zext_i8(i8 %in) {
+; AVX1-32-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movzbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movzbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-64-NEXT: retq
+ %ext = zext i8 %in to i32
+ %insert = insertelement <8 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %splat
+}
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index ee63ec6539188..ff41083835f4f 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -409,11 +409,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE2-LABEL: example24:
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movd %esi, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB6_1: # %vector.body
@@ -441,11 +441,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE41-LABEL: example24:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE41-NEXT: movd %esi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB6_1: # %vector.body
@@ -470,11 +470,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX1-LABEL: example24:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovd %esi, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
index 556c858759fc7..afb9aa9411ed6 100644
--- a/test/CodeGen/X86/cvtv2f32.ll
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -5,8 +5,8 @@
; uitofp <2 x i32> codegen from buildvector or legalization is different but gives the same results
; across the full 0 - 0xFFFFFFFF u32 range.
-define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
-; X32-LABEL: uitofp_2i32_buildvector:
+define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) {
+; X32-LABEL: uitofp_2i32_cvt_buildvector:
; X32: # %bb.0:
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
@@ -18,7 +18,7 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
; X32-NEXT: mulps %xmm1, %xmm0
; X32-NEXT: retl
;
-; X64-LABEL: uitofp_2i32_buildvector:
+; X64-LABEL: uitofp_2i32_cvt_buildvector:
; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm1
; X64-NEXT: pinsrd $1, %esi, %xmm1
@@ -38,6 +38,37 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
ret <2 x float> %t5
}
+define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) {
+; X32-LABEL: uitofp_2i32_buildvector_cvt:
+; X32: # %bb.0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X32-NEXT: movapd {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15]
+; X32-NEXT: orpd %xmm1, %xmm2
+; X32-NEXT: subpd %xmm1, %xmm2
+; X32-NEXT: cvtpd2ps %xmm2, %xmm1
+; X32-NEXT: mulps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: uitofp_2i32_buildvector_cvt:
+; X64: # %bb.0:
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: movd %edi, %xmm2
+; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15]
+; X64-NEXT: por %xmm1, %xmm2
+; X64-NEXT: subpd %xmm1, %xmm2
+; X64-NEXT: cvtpd2ps %xmm2, %xmm1
+; X64-NEXT: mulps %xmm1, %xmm0
+; X64-NEXT: retq
+ %t1 = insertelement <2 x i32> undef, i32 %x, i32 0
+ %t2 = insertelement <2 x i32> %t1, i32 %y, i32 1
+ %t3 = uitofp <2 x i32> %t2 to <2 x float>
+ %t4 = fmul <2 x float> %v, %t3
+ ret <2 x float> %t4
+}
+
define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
; X32-LABEL: uitofp_2i32_legalized:
; X32: # %bb.0:
diff --git a/test/CodeGen/X86/fixup-bw-inst.mir b/test/CodeGen/X86/fixup-bw-inst.mir
index cea483e1b9bc6..e5a5e16108fb5 100644
--- a/test/CodeGen/X86/fixup-bw-inst.mir
+++ b/test/CodeGen/X86/fixup-bw-inst.mir
@@ -26,6 +26,12 @@
ret i16 %i.0
}
+ define i16 @test4() {
+ entry:
+ %t1 = zext i1 undef to i16
+ %t2 = or i16 undef, %t1
+ ret i16 %t2
+ }
...
---
# CHECK-LABEL: name: test1
@@ -149,3 +155,47 @@ body: |
RETQ %ax
...
+---
+# CHECK-LABEL: name: test4
+name: test4
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%r9d' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+# This code copies r10b into r9b and then uses r9w. We would like to promote
+# the copy to a 32-bit copy, but because r9w is used this is not acceptable.
+body: |
+ bb.0.entry:
+ successors:
+ liveins: %r9d
+
+ %r9b = MOV8rr undef %r10b, implicit-def %r9d, implicit killed %r9d, implicit-def %eflags
+ ; CHECK-NOT: MOV32rr
+ %ax = OR16rr undef %ax, %r9w, implicit-def %eflags
+ RETQ %ax
+...
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index 1c97e8c768ccd..cc434bf18ab31 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -206,8 +206,8 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
@@ -245,8 +245,8 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
@@ -292,8 +292,8 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
@@ -351,8 +351,8 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
@@ -400,8 +400,8 @@ define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index df97973aecbd2..50f44419e8230 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -695,7 +695,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rdi)
diff --git a/test/CodeGen/X86/pr33349.ll b/test/CodeGen/X86/pr33349.ll
index b1428ba6667c8..8f9c861d9ecf3 100644
--- a/test/CodeGen/X86/pr33349.ll
+++ b/test/CodeGen/X86/pr33349.ll
@@ -40,7 +40,7 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX: # %bb.0: # %bb
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kshiftrw $1, %k1, %k2
; SKX-NEXT: kmovd %k2, %eax
; SKX-NEXT: testb $1, %al
diff --git a/test/CodeGen/X86/pr35765.ll b/test/CodeGen/X86/pr35765.ll
new file mode 100644
index 0000000000000..4d097459e33ac
--- /dev/null
+++ b/test/CodeGen/X86/pr35765.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
+
+@ll = local_unnamed_addr global i64 0, align 8
+@x = local_unnamed_addr global i64 2651237805702985558, align 8
+@s1 = local_unnamed_addr global { i8, i8 } { i8 123, i8 5 }, align 2
+@s2 = local_unnamed_addr global { i8, i8 } { i8 -122, i8 3 }, align 2
+
+define void @PR35765() {
+; CHECK-LABEL: PR35765:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx
+; CHECK-NEXT: addl $-1398, %ecx # imm = 0xFA8A
+; CHECK-NEXT: movl $4, %eax
+; CHECK-NEXT: # kill: def %cl killed %cl killed %ecx
+; CHECK-NEXT: shll %cl, %eax
+; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx
+; CHECK-NEXT: movzwl {{.*}}(%rip), %edx
+; CHECK-NEXT: notl %edx
+; CHECK-NEXT: orl $63488, %edx # imm = 0xF800
+; CHECK-NEXT: movzwl %dx, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: xorl %eax, %edx
+; CHECK-NEXT: movslq %edx, %rax
+; CHECK-NEXT: movq %rax, {{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+ %bf.load.i = load i16, i16* bitcast ({ i8, i8 }* @s1 to i16*), align 2
+ %bf.clear.i = and i16 %bf.load.i, 2047
+ %conv.i = zext i16 %bf.clear.i to i32
+ %sub.i = add nsw i32 %conv.i, -1398
+ %shl.i = shl i32 4, %sub.i
+ %0 = load i64, i64* @x, align 8
+ %bf.load1.i = load i16, i16* bitcast ({ i8, i8 }* @s2 to i16*), align 2
+ %bf.clear2.i = and i16 %bf.load1.i, 2047
+ %1 = xor i16 %bf.clear2.i, -1
+ %neg.i = zext i16 %1 to i64
+ %or.i = or i64 %0, %neg.i
+ %conv5.i = trunc i64 %or.i to i32
+ %conv6.i = and i32 %conv5.i, 65535
+ %xor.i = xor i32 %conv6.i, %shl.i
+ %conv7.i = sext i32 %xor.i to i64
+ store i64 %conv7.i, i64* @ll, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index 8642bc596f39b..490c232a161c8 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -54,16 +54,16 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -137,8 +137,8 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -267,8 +267,8 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
@@ -277,8 +277,8 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovd %edi, %xmm2
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
@@ -392,8 +392,8 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: psubusb %xmm2, %xmm0
; SSE2-NEXT: psubusb %xmm2, %xmm1
; SSE2-NEXT: retq
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index 410378ffbad29..1163307a0c34b 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -1,6 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512BW
; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization.
@@ -14,14 +17,14 @@ define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: ne_i128:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: setne %al
+; AVXANY-NEXT: retq
%bcx = bitcast <2 x i64> %x to i128
%bcy = bitcast <2 x i64> %y to i128
%cmp = icmp ne i128 %bcx, %bcy
@@ -39,14 +42,14 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: eq_i128:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: sete %al
+; AVXANY-NEXT: retq
%bcx = bitcast <2 x i64> %x to i128
%bcy = bitcast <2 x i64> %y to i128
%cmp = icmp eq i128 %bcx, %bcy
@@ -80,15 +83,39 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i256:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmovmskb %ymm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $-1, %ecx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX1-LABEL: ne_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovq %xmm2, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm2, %r8
+; AVX1-NEXT: vmovq %xmm1, %rdi
+; AVX1-NEXT: xorq %rax, %rdi
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rsi
+; AVX1-NEXT: xorq %rcx, %rsi
+; AVX1-NEXT: orq %rdi, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: xorq %rdx, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: xorq %r8, %rcx
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: ne_i256:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: setne %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%bcx = bitcast <4 x i64> %x to i256
%bcy = bitcast <4 x i64> %y to i256
%cmp = icmp ne i256 %bcx, %bcy
@@ -122,15 +149,39 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i256:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmovmskb %ymm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $-1, %ecx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX1-LABEL: eq_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovq %xmm2, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm2, %r8
+; AVX1-NEXT: vmovq %xmm1, %rdi
+; AVX1-NEXT: xorq %rax, %rdi
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rsi
+; AVX1-NEXT: xorq %rcx, %rsi
+; AVX1-NEXT: orq %rdi, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: xorq %rdx, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: xorq %r8, %rcx
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: eq_i256:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: sete %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%bcx = bitcast <4 x i64> %x to i256
%bcy = bitcast <4 x i64> %y to i256
%cmp = icmp eq i256 %bcx, %bcy
@@ -138,43 +189,37 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
ret i32 %zext
}
-; This test models the expansion of 'memcmp(a, b, 32) != 0'
+; This test models the expansion of 'memcmp(a, b, 32) != 0'
; if we allowed 2 pairs of 16-byte loads per block.
define i32 @ne_i128_pair(i128* %a, i128* %b) {
; SSE2-LABEL: ne_i128_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq 8(%rdi), %rcx
-; SSE2-NEXT: xorq (%rsi), %rax
-; SSE2-NEXT: xorq 8(%rsi), %rcx
-; SSE2-NEXT: movq 24(%rdi), %rdx
-; SSE2-NEXT: movq 16(%rdi), %rdi
-; SSE2-NEXT: xorq 16(%rsi), %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: xorq 24(%rsi), %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu (%rsi), %xmm2
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE2-NEXT: movdqu 16(%rsi), %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rdx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i128_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: xorq (%rsi), %rax
-; AVX2-NEXT: xorq 8(%rsi), %rcx
-; AVX2-NEXT: movq 24(%rdi), %rdx
-; AVX2-NEXT: movq 16(%rdi), %rdi
-; AVX2-NEXT: xorq 16(%rsi), %rdi
-; AVX2-NEXT: orq %rax, %rdi
-; AVX2-NEXT: xorq 24(%rsi), %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: ne_i128_pair:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
+; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: setne %al
+; AVXANY-NEXT: retq
%a0 = load i128, i128* %a
%b0 = load i128, i128* %b
%xor1 = xor i128 %a0, %b0
@@ -189,43 +234,37 @@ define i32 @ne_i128_pair(i128* %a, i128* %b) {
ret i32 %z
}
-; This test models the expansion of 'memcmp(a, b, 32) == 0'
+; This test models the expansion of 'memcmp(a, b, 32) == 0'
; if we allowed 2 pairs of 16-byte loads per block.
define i32 @eq_i128_pair(i128* %a, i128* %b) {
; SSE2-LABEL: eq_i128_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq 8(%rdi), %rcx
-; SSE2-NEXT: xorq (%rsi), %rax
-; SSE2-NEXT: xorq 8(%rsi), %rcx
-; SSE2-NEXT: movq 24(%rdi), %rdx
-; SSE2-NEXT: movq 16(%rdi), %rdi
-; SSE2-NEXT: xorq 16(%rsi), %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: xorq 24(%rsi), %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu (%rsi), %xmm2
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE2-NEXT: movdqu 16(%rsi), %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rdx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i128_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: xorq (%rsi), %rax
-; AVX2-NEXT: xorq 8(%rsi), %rcx
-; AVX2-NEXT: movq 24(%rdi), %rdx
-; AVX2-NEXT: movq 16(%rdi), %rdi
-; AVX2-NEXT: xorq 16(%rsi), %rdi
-; AVX2-NEXT: orq %rax, %rdi
-; AVX2-NEXT: xorq 24(%rsi), %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rdx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: eq_i128_pair:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
+; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: sete %al
+; AVXANY-NEXT: retq
%a0 = load i128, i128* %a
%b0 = load i128, i128* %b
%xor1 = xor i128 %a0, %b0
@@ -240,7 +279,7 @@ define i32 @eq_i128_pair(i128* %a, i128* %b) {
ret i32 %z
}
-; This test models the expansion of 'memcmp(a, b, 64) != 0'
+; This test models the expansion of 'memcmp(a, b, 64) != 0'
; if we allowed 2 pairs of 32-byte loads per block.
define i32 @ne_i256_pair(i256* %a, i256* %b) {
@@ -273,34 +312,48 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i256_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: movq 24(%rdi), %r11
-; AVX2-NEXT: movq (%rdi), %r8
-; AVX2-NEXT: movq 8(%rdi), %r10
-; AVX2-NEXT: xorq 8(%rsi), %r10
-; AVX2-NEXT: xorq 24(%rsi), %r11
-; AVX2-NEXT: xorq (%rsi), %r8
-; AVX2-NEXT: xorq 16(%rsi), %r9
-; AVX2-NEXT: movq 48(%rdi), %rdx
-; AVX2-NEXT: movq 32(%rdi), %rax
-; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %rdi
-; AVX2-NEXT: xorq 40(%rsi), %rdi
-; AVX2-NEXT: xorq 56(%rsi), %rcx
-; AVX2-NEXT: orq %r11, %rcx
-; AVX2-NEXT: orq %rdi, %rcx
-; AVX2-NEXT: orq %r10, %rcx
-; AVX2-NEXT: xorq 32(%rsi), %rax
-; AVX2-NEXT: xorq 48(%rsi), %rdx
-; AVX2-NEXT: orq %r9, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
+; AVX1-LABEL: ne_i256_pair:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq 16(%rdi), %r9
+; AVX1-NEXT: movq 24(%rdi), %r11
+; AVX1-NEXT: movq (%rdi), %r8
+; AVX1-NEXT: movq 8(%rdi), %r10
+; AVX1-NEXT: xorq 8(%rsi), %r10
+; AVX1-NEXT: xorq 24(%rsi), %r11
+; AVX1-NEXT: xorq (%rsi), %r8
+; AVX1-NEXT: xorq 16(%rsi), %r9
+; AVX1-NEXT: movq 48(%rdi), %rdx
+; AVX1-NEXT: movq 32(%rdi), %rax
+; AVX1-NEXT: movq 56(%rdi), %rcx
+; AVX1-NEXT: movq 40(%rdi), %rdi
+; AVX1-NEXT: xorq 40(%rsi), %rdi
+; AVX1-NEXT: xorq 56(%rsi), %rcx
+; AVX1-NEXT: orq %r11, %rcx
+; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: orq %r10, %rcx
+; AVX1-NEXT: xorq 32(%rsi), %rax
+; AVX1-NEXT: xorq 48(%rsi), %rdx
+; AVX1-NEXT: orq %r9, %rdx
+; AVX1-NEXT: orq %rax, %rdx
+; AVX1-NEXT: orq %r8, %rdx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: ne_i256_pair:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vmovdqu (%rdi), %ymm0
+; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
+; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: setne %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%a0 = load i256, i256* %a
%b0 = load i256, i256* %b
%xor1 = xor i256 %a0, %b0
@@ -315,7 +368,7 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
ret i32 %z
}
-; This test models the expansion of 'memcmp(a, b, 64) == 0'
+; This test models the expansion of 'memcmp(a, b, 64) == 0'
; if we allowed 2 pairs of 32-byte loads per block.
define i32 @eq_i256_pair(i256* %a, i256* %b) {
@@ -348,34 +401,48 @@ define i32 @eq_i256_pair(i256* %a, i256* %b) {
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i256_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: movq 24(%rdi), %r11
-; AVX2-NEXT: movq (%rdi), %r8
-; AVX2-NEXT: movq 8(%rdi), %r10
-; AVX2-NEXT: xorq 8(%rsi), %r10
-; AVX2-NEXT: xorq 24(%rsi), %r11
-; AVX2-NEXT: xorq (%rsi), %r8
-; AVX2-NEXT: xorq 16(%rsi), %r9
-; AVX2-NEXT: movq 48(%rdi), %rdx
-; AVX2-NEXT: movq 32(%rdi), %rax
-; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %rdi
-; AVX2-NEXT: xorq 40(%rsi), %rdi
-; AVX2-NEXT: xorq 56(%rsi), %rcx
-; AVX2-NEXT: orq %r11, %rcx
-; AVX2-NEXT: orq %rdi, %rcx
-; AVX2-NEXT: orq %r10, %rcx
-; AVX2-NEXT: xorq 32(%rsi), %rax
-; AVX2-NEXT: xorq 48(%rsi), %rdx
-; AVX2-NEXT: orq %r9, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: retq
+; AVX1-LABEL: eq_i256_pair:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq 16(%rdi), %r9
+; AVX1-NEXT: movq 24(%rdi), %r11
+; AVX1-NEXT: movq (%rdi), %r8
+; AVX1-NEXT: movq 8(%rdi), %r10
+; AVX1-NEXT: xorq 8(%rsi), %r10
+; AVX1-NEXT: xorq 24(%rsi), %r11
+; AVX1-NEXT: xorq (%rsi), %r8
+; AVX1-NEXT: xorq 16(%rsi), %r9
+; AVX1-NEXT: movq 48(%rdi), %rdx
+; AVX1-NEXT: movq 32(%rdi), %rax
+; AVX1-NEXT: movq 56(%rdi), %rcx
+; AVX1-NEXT: movq 40(%rdi), %rdi
+; AVX1-NEXT: xorq 40(%rsi), %rdi
+; AVX1-NEXT: xorq 56(%rsi), %rcx
+; AVX1-NEXT: orq %r11, %rcx
+; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: orq %r10, %rcx
+; AVX1-NEXT: xorq 32(%rsi), %rax
+; AVX1-NEXT: xorq 48(%rsi), %rdx
+; AVX1-NEXT: orq %r9, %rdx
+; AVX1-NEXT: orq %rax, %rdx
+; AVX1-NEXT: orq %r8, %rdx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: eq_i256_pair:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vmovdqu (%rdi), %ymm0
+; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
+; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: sete %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%a0 = load i256, i256* %a
%b0 = load i256, i256* %b
%xor1 = xor i256 %a0, %b0
diff --git a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
index a65c1d312aa40..f7f9dff9beb08 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -2,17 +2,6 @@
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_storeu_ps:
-; SSE: ## %bb.0:
-; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT: movups %xmm0, (%eax)
-; SSE-NEXT: retl
-;
-; KNL-LABEL: test_x86_sse_storeu_ps:
-; KNL: ## %bb.0:
-; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL-NEXT: vmovups %xmm0, (%eax)
-; KNL-NEXT: retl
; CHECK-LABEL: test_x86_sse_storeu_ps:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -25,20 +14,6 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_add_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_add_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_add_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x58,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_add_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: addss %xmm1, %xmm0
@@ -50,20 +25,6 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_sub_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: subss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5c,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_sub_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5c,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_sub_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5c,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_sub_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: subss %xmm1, %xmm0
@@ -75,20 +36,6 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_mul_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: mulss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x59,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_mul_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x59,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_mul_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x59,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_mul_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: mulss %xmm1, %xmm0
@@ -100,20 +47,6 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_div_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: divss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5e,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_div_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5e,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_div_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5e,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_div_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: divss %xmm1, %xmm0
@@ -123,4 +56,3 @@ define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
-
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index a75a0597325d1..1acf1ad43f6de 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2364,8 +2364,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi8:
@@ -2373,8 +2373,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
%res0 = insertelement <16 x i8> undef, i8 %a0, i32 0
%res1 = insertelement <16 x i8> %res0, i8 %a0, i32 1
@@ -2401,15 +2401,15 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi16:
; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
%res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
%res1 = insertelement <8 x i16> %res0, i16 %a0, i32 1
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 3571e2968bf84..3dd3be6853f04 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -184,20 +184,6 @@ define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {
declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_add_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: addsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x58,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_add_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x58,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_add_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x58,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_add_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: addsd %xmm1, %xmm0
@@ -209,20 +195,6 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_sub_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: subsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5c,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_sub_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5c,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_sub_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5c,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_sub_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: subsd %xmm1, %xmm0
@@ -234,20 +206,6 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_mul_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: mulsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x59,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_mul_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x59,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_mul_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x59,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_mul_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: mulsd %xmm1, %xmm0
@@ -259,20 +217,6 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_div_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: divsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5e,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_div_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5e,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_div_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5e,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_div_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: divsd %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index bdfc96ba97d5f..51f228b414ec0 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -919,12 +919,10 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
;
; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
@@ -1448,12 +1446,10 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
diff --git a/test/CodeGen/X86/vec_set-H.ll b/test/CodeGen/X86/vec_set-H.ll
index 03324f02a4fef..d96c8bbc61718 100644
--- a/test/CodeGen/X86/vec_set-H.ll
+++ b/test/CodeGen/X86/vec_set-H.ll
@@ -5,8 +5,8 @@ define <2 x i64> @doload64(i16 signext %x) nounwind {
; CHECK-LABEL: doload64:
; CHECK: # %bb.0:
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: retl
%tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0
%tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index 2178eb70cdecc..2cbf306c8ba00 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -5459,38 +5459,30 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4
-; AVX512F-NEXT: vpslld $31, %zmm4, %zmm4
; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
; AVX512F-NEXT: kmovw %k0, 14(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512F-NEXT: kmovw %k0, 12(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512F-NEXT: kmovw %k0, 10(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, 8(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, 6(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, 4(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, 2(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rdi)
; AVX512F-NEXT: movq %rdi, %rax
@@ -5505,38 +5497,30 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4
-; AVX512DQ-NEXT: vpslld $31, %zmm4, %zmm4
; AVX512DQ-NEXT: vptestmd %zmm4, %zmm4, %k0
; AVX512DQ-NEXT: kmovw %k0, 14(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, 12(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, 10(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, 8(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, 6(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 4(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: movq %rdi, %rax
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index 782c72e2a4d4f..b2c0a4d096c2b 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; Lower common integer comparisons such as 'isPositive' efficiently:
; https://llvm.org/bugs/show_bug.cgi?id=26701
@@ -84,50 +84,13 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
}
define <1 x i128> @test_strange_type(<1 x i128> %x) {
-; SSE2-LABEL: test_strange_type:
-; SSE2: # %bb.0:
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: movq %rsi, %xmm0
-; SSE2-NEXT: notq %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: movq %rsi, %rdx
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: test_strange_type:
-; SSE42: # %bb.0:
-; SSE42-NEXT: sarq $63, %rsi
-; SSE42-NEXT: movq %rsi, %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE42-NEXT: pxor %xmm0, %xmm1
-; SSE42-NEXT: movq %xmm1, %rax
-; SSE42-NEXT: pextrq $1, %xmm1, %rdx
-; SSE42-NEXT: retq
-;
-; AVX1-LABEL: test_strange_type:
-; AVX1: # %bb.0:
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_strange_type:
-; AVX2: # %bb.0:
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: retq
+; CHECK-LABEL: test_strange_type:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sarq $63, %rsi
+; CHECK-NEXT: notq %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: retq
%sign = ashr <1 x i128> %x, <i128 127>
%not = xor <1 x i128> %sign, <i128 -1>
ret <1 x i128> %not
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index ca670f40ab3fb..8aa8682b6e441 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -796,8 +796,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -1011,8 +1011,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT: psllw $5, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 890cedf97c9dc..1e5dbea6bc618 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -663,8 +663,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -816,8 +816,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 9481e46c0c52e..724fd3454eec4 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -612,8 +612,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -758,8 +758,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 2f5a2b1161159..2ff7ef4328f53 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -12,8 +12,8 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -206,14 +206,14 @@ define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_0101010101010101:
; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
@@ -257,8 +257,8 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -372,14 +372,12 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
@@ -1181,21 +1179,20 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: psrlq $16, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlq $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,1,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
@@ -1203,7 +1200,7 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
@@ -1489,8 +1486,8 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
@@ -1531,8 +1528,8 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
@@ -1576,8 +1573,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
@@ -1614,8 +1611,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
@@ -1653,8 +1650,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
@@ -1706,8 +1703,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
@@ -1761,15 +1758,14 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movzwl %cx, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1813,13 +1809,13 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movzbl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 072d71fae570a..fc22040578b14 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -63,14 +63,14 @@ define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_00000000:
; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_00000000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_00000000:
@@ -1123,33 +1123,44 @@ define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
-; SSE-LABEL: shuffle_v8i16_0213cedf:
-; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_0213cedf:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
+;
SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0213cedf: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0213cedf: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq ; ; AVX1-LABEL: shuffle_v8i16_0213cedf: ; AVX1: # %bb.0: +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i16_0213cedf: ; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: retq @@ -1157,14 +1168,14 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { ; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] ; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15] +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15] ; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-FAST-NEXT: retq @@ -2111,79 +2122,115 @@ define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) { } define <8 x i16> @shuffle_v8i16_01100110(<8 x i16> %a) { -; SSE2-LABEL: shuffle_v8i16_01100110: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_01100110: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq ; -; SSSE3-LABEL: shuffle_v8i16_01100110: -; SSSE3: # 
%bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] -; SSSE3-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_01100110: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq ; -; SSE41-LABEL: shuffle_v8i16_01100110: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] -; SSE41-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i16_01100110: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_01100110: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] -; AVX-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v8i16_01100110: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_01100110: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_01100110: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 1, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_01u0u110(<8 x i16> %a) { -; SSE2-LABEL: shuffle_v8i16_01u0u110: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_01u0u110: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq ; -; SSSE3-LABEL: shuffle_v8i16_01u0u110: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1] -; SSSE3-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_01u0u110: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq ; -; SSE41-LABEL: shuffle_v8i16_01u0u110: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1] -; SSE41-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i16_01u0u110: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_01u0u110: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1] -; AVX-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v8i16_01u0u110: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_01u0u110: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_01u0u110: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> 
zeroinitializer, <8 x i32> <i32 0, i32 1, i32 undef, i32 0, i32 undef, i32 1, i32 1, i32 0> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_467uu675(<8 x i16> %a) { -; SSE2-LABEL: shuffle_v8i16_467uu675: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] -; SSE2-NEXT: retq +; SSE-LABEL: shuffle_v8i16_467uu675: +; SSE: # %bb.0: +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq ; -; SSSE3-LABEL: shuffle_v8i16_467uu675: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11] -; SSSE3-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_467uu675: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: retq ; -; SSE41-LABEL: shuffle_v8i16_467uu675: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11] -; SSE41-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i16_467uu675: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_467uu675: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11] -; AVX-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v8i16_467uu675: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_467uu675: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_467uu675: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 4, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7, i32 5> ret <8 x i16> %shuffle } @@ -2471,15 +2518,15 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) { ; SSE-LABEL: insert_dup_mem_v8i16_i32: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i32: @@ -2498,8 +2545,8 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: 
insert_dup_mem_v8i16_sext_i16: @@ -2547,15 +2594,15 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) { ; SSE-LABEL: insert_dup_elt1_mem_v8i16_i32: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32: @@ -2574,8 +2621,8 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32: @@ -2612,8 +2659,8 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: @@ -2665,8 +2712,8 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) { ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index cbd1b83a4eb2f..aafc9fc7dcd21 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -8,8 +8,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -24,17 +24,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 
= xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> ret <16 x i16> %shuffle } @@ -42,17 +63,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> ret <16 x i16> %shuffle } @@ -60,17 +102,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> ret <16 x i16> %shuffle } @@ -78,8 +141,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 define 
<16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -96,8 +159,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -114,8 +177,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -132,8 +195,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -152,9 +215,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -162,8 +226,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: retq @@ -191,9 +255,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -220,8 +285,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -254,8 +319,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -288,8 +353,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -314,8 +379,8 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw 
{{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -340,8 +405,8 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -366,8 +431,8 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -390,18 +455,18 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -411,8 +476,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 ; ; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -426,18 +491,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 define <16 x i16> 
@shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -447,8 +512,8 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1 ; ; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -606,15 +671,36 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; 
AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> ret <16 x i16> %shuffle } @@ -622,15 +708,36 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> ret <16 x i16> %shuffle } @@ -638,15 +745,36 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: 
shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> ret <16 x i16> %shuffle } @@ -913,8 +1041,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2 ; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-SLOW-NEXT: retq ; @@ -965,16 +1093,14 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1 define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] -; AVX1-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1014,20 +1140,12 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # %bb.0: @@ -1042,17 +1160,35 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: 
vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
+; AVX512VL-FAST-NEXT: retq
 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
 ret <16 x i16> %shuffle
}
@@ -1060,17 +1196,35 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX512VL-FAST-NEXT: retq
 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
 ret <16 x i16> %shuffle
}
@@ -1078,17 +1232,35 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX512VL-FAST-NEXT: retq
 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
 ret <16 x i16> %shuffle
}
@@ -1256,9 +1428,11 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1273,9 +1447,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1290,9 +1466,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1411,8 +1589,8 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1
define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1464,8 +1642,8 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_0
define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
@@ -2036,9 +2214,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -2046,8 +2225,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
@@ -3261,15 +3440,25 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5,16,17,18,19,18,19,16,17,24,25,26,27,22,23,20,21]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,6,7,4,5,4,5,6,7,18,19,16,17,20,21,22,23,22,23,20,21,20,21,22,23]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,2,3,0,1,8,9,10,11,6,7,4,5,18,19,16,17,18,19,16,17,24,25,26,27,22,23,20,21]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,12,13,14,15,18,19,16,17,22,23,20,21,22,23,20,21,28,29,30,31]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX512VL: # %bb.0:
@@ -3344,17 +3533,16 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1
define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,4,5,14,15,0,1,4,5,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
@@ -4035,15 +4223,15 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16>
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
@@ -4055,8 +4243,8 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
@@ -4073,8 +4261,8 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -4099,14 +4287,14 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
@@ -4116,8 +4304,8 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a,
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
@@ -4132,15 +4320,15 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
@@ -4152,8 +4340,8 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
@@ -4236,17 +4424,35 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,7,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11,24,25,28,29,30,31,30,31,24,25,28,29,30,31,26,27]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27]
+; AVX512VL-FAST-NEXT: retq
 %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 4, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7, i32 5, i32 12, i32 14, i32 15, i32 undef, i32 undef, i32 14, i32 15, i32 13>
 ret <16 x i16> %shuffle
}
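
The hunks above all follow one pattern: to splat a single 16-bit element, the SLOW lowering paths now duplicate the element inside its 32-bit lane with vpshuflw/vpshufhw and then broadcast that whole dword with vpshufd (or vpbroadcastd), instead of loading a vpshufb mask from the constant pool. A minimal sketch of an input that hits this path, assuming an llc invocation along the lines of the check prefixes in this file (the function name and flags are illustrative, not part of the test suite):

; llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s
; Expected to lower like the AVX1 checks above:
;   vpshuflw puts two copies of word 3 into dword 0,
;   vpshufd then broadcasts dword 0 to all four dwords.
define <8 x i16> @splat_word_3(<8 x i16> %a) {
  %s = shufflevector <8 x i16> %a, <8 x i16> undef,
       <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i16> %s
}
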
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 51ef3a18438f4..948cfd10076d7 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1299,8 +1299,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1317,13 +1317,13 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -1331,8 +1331,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
@@ -1348,8 +1348,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
;
; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX512VLBW-SLOW: # %bb.0:
-; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1
@@ -1896,7 +1896,7 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -2540,6 +2540,34 @@ define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
 ret <32 x i8> %shuffle
}
+; PR33740
+define <32 x i8> @shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23, i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
+ ret <32 x i8> %shuffle
+}
+
define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
; AVX1: # %bb.0:
@@ -2577,7 +2605,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2586,7 +2614,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2OR512VL-NEXT: retq
 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index f3433ce834cd5..b41fcbe79b0ca 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -16,7 +16,6 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
@@ -51,7 +50,6 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512VL-NEXT: movq $-1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
@@ -85,7 +83,6 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
@@ -110,7 +107,6 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
@@ -125,7 +121,6 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -157,8 +152,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -172,8 +166,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT: vpslld $31, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -374,8 +367,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -389,8 +381,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
-; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -422,8 +413,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -437,7 +427,6 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -470,8 +459,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -484,7 +472,6 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -518,8 +505,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -534,7 +520,6 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -566,7 +551,6 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
@@ -578,7 +562,6 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512VL-NEXT: kmovw %edi, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
-; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %ax killed %ax killed %eax
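
Every vector-shuffle-v1.ll hunk above deletes the same kind of instruction: a vpslld $31 / vpsllq $63 that only moved a known bit into the sign position before a vptestmd/vptestmq. Because the shuffled vectors here are built from {z}-masked all-ones moves, each element is already 0 or -1, so testing the element directly against zero (k[i] = element != 0) produces the identical mask register and the shift is dead. The IR bodies are elided in this rendering of the diff; a hypothetical reduced input that exercises the same i1-vector path (function name and mask chosen for illustration, not copied from the file) would be:

; A <8 x i1> shuffle whose index 9 selects a lane of the all-false second
; operand, forcing the mask to be expanded to a 0/-1 vector, permuted,
; and re-tested into a k-register -- now without the redundant shift.
define i8 @mask_shuffle_sketch(i8 %a) {
  %m = bitcast i8 %a to <8 x i1>
  %s = shufflevector <8 x i1> %m, <8 x i1> zeroinitializer,
       <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %r = bitcast <8 x i1> %s to i8
  ret i8 %r
}
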
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index fd4c30fb327bc..56395309897f0 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -1852,6 +1852,83 @@ entry:
 ret <16 x i8> %1
}
+define <8 x i16> @PR32160(<8 x i32> %x) {
+; SSE2-LABEL: PR32160:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: PR32160:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: PR32160:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: PR32160:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR32160:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: PR32160:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: PR32160:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: PR32160:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: PR32160:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %shuf = trunc <8 x i32> %x to <8 x i16>
+ %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i16> %trunc
+}
+
define void @PR34773(i16* %a0, i8* %a1) {
; SSE-LABEL: PR34773:
; SSE: # %bb.0:
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index 96e97c70dbf4d..bd38d901cc8ba 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -304,14 +304,14 @@ entry:
define <16 x i8> @load_splat_16i8_16i8_0101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i8_16i8_0101010101010101:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i8_16i8_0101010101010101:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i8_16i8_0101010101010101:
@@ -384,15 +384,15 @@ entry:
define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -459,15 +459,15 @@ entry:
define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 792bbbed52e14..e02258a788a11 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -645,9 +645,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
-; AVX512-NEXT: vpsllw $7, %xmm3, %xmm1
-; AVX512-NEXT: vpmovb2m %zmm1, %k0
-; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT: vpmovb2m %zmm3, %k0
; AVX512-NEXT: vpmovb2m %zmm0, %k1
; AVX512-NEXT: kxnorw %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
@@ -958,9 +956,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vpsllw $7, %ymm8, %ymm1
-; AVX512-NEXT: vpmovb2m %zmm1, %k0
-; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512-NEXT: vpmovb2m %zmm8, %k0
; AVX512-NEXT: vpmovb2m %zmm0, %k1
; AVX512-NEXT: kxnord %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
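
The PR32160 test added to vector-trunc.ll is also the quickest way to inspect the truncate-then-splat improvement in isolation. Its IR body (copied from the hunk above) runs standalone; the llc invocation is an assumption inferred from the SSSE3 check prefix, since this page does not show the file's RUN lines:

; llc -mtriple=x86_64-unknown-unknown -mattr=+ssse3 < %s
; The splat of truncated element 2 now folds into two shuffles,
;   pshufhw xmm0 = xmm0[0,1,2,3,4,4,6,7]   ; two copies of word 4 in dword 2
;   pshufd  xmm0 = xmm0[2,2,2,2]           ; broadcast dword 2
; instead of truncating both input halves first, as SSE2 still must.
define <8 x i16> @PR32160(<8 x i32> %x) {
  %shuf = trunc <8 x i32> %x to <8 x i16>
  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef,
           <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %trunc
}
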