author     Dimitry Andric <dim@FreeBSD.org>    2018-01-06 21:34:26 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2018-01-06 21:34:26 +0000
commit     d215fd3b74b90f5dc1964610926fcc2a20f959aa (patch)
tree       0c9f21e40eae033d6760008729f37d2103e2c654 /test/CodeGen/X86
parent     b8a2042aa938069e862750553db0e4d82d25822c (diff)
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/avx-intrinsics-fast-isel.ll  |   8
-rw-r--r--  test/CodeGen/X86/avx-splat.ll  |   4
-rw-r--r--  test/CodeGen/X86/avx-vbroadcast.ll  |  22
-rw-r--r--  test/CodeGen/X86/avx512-calling-conv.ll  |  24
-rw-r--r--  test/CodeGen/X86/avx512-cvt.ll  | 326
-rw-r--r--  test/CodeGen/X86/avx512-ext.ll  |  25
-rw-r--r--  test/CodeGen/X86/avx512-extract-subvector-load-store.ll  |  62
-rw-r--r--  test/CodeGen/X86/avx512-insert-extract.ll  |   1
-rw-r--r--  test/CodeGen/X86/avx512-shuffles/partial_permute.ll  |  77
-rw-r--r--  test/CodeGen/X86/avx512-skx-insert-subvec.ll  |   4
-rw-r--r--  test/CodeGen/X86/avx512-vec-cmp.ll  |   6
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll  | 330
-rw-r--r--  test/CodeGen/X86/avx512vl-vec-masked-cmp.ll  | 900
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-128.ll  |  60
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-256.ll  |  14
-rw-r--r--  test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll  |  47
-rw-r--r--  test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll  |  59
-rw-r--r--  test/CodeGen/X86/bitcast-int-to-vector-bool.ll  |  16
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-128.ll  |  57
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-256.ll  |  15
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-512.ll  |   6
-rw-r--r--  test/CodeGen/X86/broadcastm-lowering.ll  |   3
-rw-r--r--  test/CodeGen/X86/build-vector-128.ll  |  98
-rw-r--r--  test/CodeGen/X86/build-vector-256.ll  |  74
-rw-r--r--  test/CodeGen/X86/cast-vsel.ll  |  24
-rw-r--r--  test/CodeGen/X86/cvtv2f32.ll  |  37
-rw-r--r--  test/CodeGen/X86/fixup-bw-inst.mir  |  50
-rw-r--r--  test/CodeGen/X86/memset-nonzero.ll  |  20
-rw-r--r--  test/CodeGen/X86/oddshuffles.ll  |   2
-rw-r--r--  test/CodeGen/X86/pr33349.ll  |   2
-rw-r--r--  test/CodeGen/X86/pr35765.ll  |  45
-rw-r--r--  test/CodeGen/X86/psubus.ll  |  24
-rw-r--r--  test/CodeGen/X86/setcc-wide-types.ll  | 367
-rw-r--r--  test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll  |  68
-rw-r--r--  test/CodeGen/X86/sse2-intrinsics-fast-isel.ll  |  16
-rw-r--r--  test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll  |  56
-rw-r--r--  test/CodeGen/X86/vec_fp_to_int.ll  |  20
-rw-r--r--  test/CodeGen/X86/vec_set-H.ll  |   4
-rw-r--r--  test/CodeGen/X86/vector-compare-results.ll  |  16
-rw-r--r--  test/CodeGen/X86/vector-pcmp.ll  |  59
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-128.ll  |   8
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-128.ll  |   8
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-128.ll  |   8
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v16.ll  | 104
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v8.ll  | 227
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v16.ll  | 604
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v32.ll  |  54
-rw-r--r--  test/CodeGen/X86/vector-shuffle-v1.ll  |  31
-rw-r--r--  test/CodeGen/X86/vector-trunc.ll  |  77
-rw-r--r--  test/CodeGen/X86/widened-broadcast.ll  |  24
-rw-r--r--  test/CodeGen/X86/x86-interleaved-access.ll  |   8
51 files changed, 2036 insertions(+), 2165 deletions(-)
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 297922809ea7e..9069755ad1319 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -2364,16 +2364,16 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
-; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi16:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
-; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index da547397c6ce8..9cd05a353fbf5 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -15,8 +15,8 @@ entry:
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 5a9f23007d86f..e2dc40c6f0ef1 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -874,39 +874,33 @@ define float @broadcast_lifetime() nounwind {
; X32-LABEL: broadcast_lifetime:
; X32: ## %bb.0:
; X32-NEXT: pushl %esi
-; X32-NEXT: subl $56, %esp
+; X32-NEXT: subl $40, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll _gfunc
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill
+; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## 4-byte Spill
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll _gfunc
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: vpermilps $0, {{[0-9]+}}(%esp), %xmm1 ## 16-byte Folded Reload
-; X32-NEXT: ## xmm1 = mem[0,0,0,0]
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; X32-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 4-byte Folded Reload
; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: addl $56, %esp
+; X32-NEXT: addl $40, %esp
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: broadcast_lifetime:
; X64: ## %bb.0:
; X64-NEXT: subq $40, %rsp
-; X64-NEXT: movq %rsp, %rdi
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill
-; X64-NEXT: movq %rsp, %rdi
+; X64-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: vpermilps $0, {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload
-; X64-NEXT: ## xmm1 = mem[0,0,0,0]
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; X64-NEXT: vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT: addq $40, %rsp
; X64-NEXT: retq
%1 = alloca <4 x float>, align 16
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 821c65bef06a8..e3cf2181387fb 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -410,3 +410,27 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
%res1 = select i1 %cond, i32 %res, i32 0
ret i32 %res1
}
+
+define <1 x i1> @test13(<1 x i1>* %foo) {
+; KNL-LABEL: test13:
+; KNL: ## %bb.0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test13:
+; SKX: ## %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test13:
+; KNL_X32: ## %bb.0:
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT: movzbl (%eax), %eax
+; KNL_X32-NEXT: ## kill: def %al killed %al killed %eax
+; KNL_X32-NEXT: retl
+ %bar = load <1 x i1>, <1 x i1>* %foo
+ ret <1 x i1> %bar
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 18e9f306bc1bf..a95e22a048563 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -2018,3 +2018,329 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
%1 = uitofp <2 x i1> %mask to <2 x double>
ret <2 x double> %1
}
+
+define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f64toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttsd2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NOVLDQ-NEXT: vcvttsd2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOVLDQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f64toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2udq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f64toub:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptoui <2 x double> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f64toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVL-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f64toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <4 x double> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f64toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f64toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT: vpslld $31, %ymm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <8 x double> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f32toub:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttss2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NOVLDQ-NEXT: vcvttss2usi %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOVLDQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: vzeroupper
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f32toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f32toub:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpsraq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptoui <2 x float> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f32toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
+; NOVL-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f32toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <4 x float> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f32toub:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f32toub:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; VL-NEXT: vpslld $31, %ymm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptoui <8 x float> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
+; ALL-LABEL: test_16f32toub:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vpslld $31, %zmm0, %zmm0
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; ALL-NEXT: retq
+ %mask = fptoui <16 x float> %a to <16 x i1>
+ %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
+ ret <16 x i32> %select
+}
+
+define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f64tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttsd2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NOVLDQ-NEXT: vcvttsd2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f64tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VL-NEXT: vpslld $31, %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f64tosb:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptosi <2 x double> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f64tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f64tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <4 x double> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f64tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f64tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <8 x double> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
+; NOVLDQ-LABEL: test_2f32tosb:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vcvttss2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm2
+; NOVLDQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NOVLDQ-NEXT: vcvttss2si %xmm0, %rax
+; NOVLDQ-NEXT: vmovq %rax, %xmm0
+; NOVLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NOVLDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVLDQ-NEXT: retq
+;
+; VL-LABEL: test_2f32tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_2f32tosb:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %mask = fptosi <2 x float> %a to <2 x i1>
+ %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
+ ret <2 x i64> %select
+}
+
+define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
+; NOVL-LABEL: test_4f32tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
+; NOVL-NEXT: vpmovsxdq %xmm0, %ymm0
+; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_4f32tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <4 x float> %a to <4 x i1>
+ %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
+ ret <4 x i64> %select
+}
+
+define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
+; NOVL-LABEL: test_8f32tosb:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
+; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; NOVL-NEXT: retq
+;
+; VL-LABEL: test_8f32tosb:
+; VL: # %bb.0:
+; VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
+; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; VL-NEXT: retq
+ %mask = fptosi <8 x float> %a to <8 x i1>
+ %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
+ ret <8 x i64> %select
+}
+
+define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
+; ALL-LABEL: test_16f32tosb:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; ALL-NEXT: retq
+ %mask = fptosi <16 x float> %a to <16 x i1>
+ %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
+ ret <16 x i32> %select
+}
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index 8c79415912179..a966235df2160 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -345,9 +345,8 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
@@ -369,9 +368,8 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovsxbd (%rdi), %ymm1
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovsxbd (%rdi), %ymm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
@@ -702,9 +700,8 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
@@ -726,9 +723,8 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovsxwd (%rdi), %ymm1
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovsxwd (%rdi), %ymm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
@@ -760,9 +756,8 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index e1ed8ea98a1c3..b49e2ceca0bff 100644
--- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -6,7 +6,7 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: kshiftrb $4, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -22,7 +22,6 @@ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -37,7 +36,7 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $6, %k0, %k0
+; AVX512-NEXT: kshiftrb $6, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -53,7 +52,6 @@ define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x doub
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -83,7 +81,6 @@ define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x do
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -113,7 +110,6 @@ define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x flo
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -143,7 +139,6 @@ define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x d
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -173,7 +168,6 @@ define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x fl
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -203,7 +197,6 @@ define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -233,7 +226,6 @@ define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -264,7 +256,6 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -295,7 +286,6 @@ define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x d
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -325,7 +315,6 @@ define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x fl
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -358,7 +347,6 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -389,7 +377,6 @@ define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -419,7 +406,6 @@ define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -450,7 +436,6 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -481,7 +466,6 @@ define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
@@ -512,7 +496,6 @@ define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x d
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
@@ -542,7 +525,6 @@ define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x fl
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -575,7 +557,6 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -608,7 +589,6 @@ define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
@@ -624,7 +604,7 @@ define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k0
+; AVX512-NEXT: kshiftrb $1, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -645,7 +625,7 @@ define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $1, %k0, %k0
+; AVX512-NEXT: kshiftrb $1, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -666,7 +646,7 @@ define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k0
+; AVX512-NEXT: kshiftrb $2, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -687,7 +667,7 @@ define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k0
+; AVX512-NEXT: kshiftrb $2, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -708,7 +688,7 @@ define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $3, %k0, %k0
+; AVX512-NEXT: kshiftrb $3, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -729,7 +709,7 @@ define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: kshiftrb $4, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -750,7 +730,7 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: kshiftrb $4, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -765,7 +745,6 @@ define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -779,7 +758,7 @@ define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $7, %k0, %k0
+; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
@@ -800,7 +779,7 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrw $6, %k0, %k0
+; AVX512-NEXT: kshiftrb $6, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -815,7 +794,6 @@ define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -863,7 +841,6 @@ define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -891,7 +868,6 @@ define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -939,7 +915,6 @@ define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -967,7 +942,6 @@ define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1015,7 +989,6 @@ define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1043,7 +1016,6 @@ define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1072,7 +1044,6 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1121,7 +1092,6 @@ define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1149,7 +1119,6 @@ define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1180,7 +1149,6 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1229,7 +1197,6 @@ define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1257,7 +1224,6 @@ define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1286,7 +1252,6 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1315,7 +1280,6 @@ define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1)
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
@@ -1363,7 +1327,6 @@ define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1391,7 +1354,6 @@ define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1422,7 +1384,6 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1453,7 +1414,6 @@ define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1)
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 7477e05f0c7f9..9e11c799e1798 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -805,7 +805,6 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 0601c011e2903..333efb04913d3 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -3,26 +3,28 @@
; FIXME: All cases here should be fixed by PR34380
-define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
-; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
ret <8 x i16> %res
}
-define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
@@ -35,13 +37,14 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i
ret <8 x i16> %res
}
-define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,6,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
@@ -55,14 +58,14 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -74,14 +77,14 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 6bee0de181abc..f6cb093d521b3 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -136,7 +136,7 @@ define <4 x i1> @test9(<8 x i1> %a, <8 x i1> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %xmm0, %k0
-; CHECK-NEXT: kshiftrw $4, %k0, %k0
+; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: retq
%res = shufflevector <8 x i1> %a, <8 x i1> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -148,7 +148,7 @@ define <2 x i1> @test10(<4 x i1> %a, <4 x i1> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: kshiftrw $2, %k0, %k0
+; CHECK-NEXT: kshiftrb $2, %k0, %k0
; CHECK-NEXT: vpmovm2q %k0, %xmm0
; CHECK-NEXT: retq
%res = shufflevector <4 x i1> %a, <4 x i1> %b, <2 x i32> <i32 2, i32 3>
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 826a4538f3f1c..6f0f873c2f70f 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -195,14 +195,12 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: movl (%rsp), %eax
@@ -235,28 +233,24 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: movl (%rsp), %ecx
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index fdd6f7126457d..8c13d4b842fcb 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -871,23 +871,14 @@ define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastd_epi32:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -903,23 +894,14 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64>
define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastd_epi32:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
@@ -1007,23 +989,14 @@ define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastq_epi64:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i2
@@ -1036,23 +1009,14 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64>
define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastq_epi64:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i2
@@ -1079,23 +1043,14 @@ define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastq_epi64:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1108,23 +1063,14 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i
define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
@@ -1151,23 +1097,14 @@ define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_broadcastsd_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i2
@@ -1180,23 +1117,14 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x
define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_broadcastsd_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i2
@@ -1223,23 +1151,14 @@ define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm256_mask_broadcastsd_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1252,23 +1171,14 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2
define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
@@ -1295,23 +1205,14 @@ define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_broadcastss_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1324,23 +1225,14 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl
define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_broadcastss_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
@@ -1419,23 +1311,14 @@ define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_movddup_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i2
@@ -1448,23 +1331,14 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub
define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_movddup_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a0 to i2
@@ -1491,23 +1365,14 @@ define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_movddup_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1520,23 +1385,14 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d
define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_movddup_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT: retq
%trn1 = trunc i8 %a0 to i4
@@ -1563,23 +1419,14 @@ define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_movehdup_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1592,23 +1439,14 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float
define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_movehdup_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
@@ -1687,23 +1525,14 @@ define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_moveldup_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1716,23 +1545,14 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float
define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_moveldup_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
@@ -1811,23 +1631,14 @@ define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_mask_permutex_epi64:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1840,23 +1651,14 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64
define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_epi64:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a0 to i4
@@ -1883,23 +1685,14 @@ define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_permutex_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -1912,23 +1705,14 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x
define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a0 to i4
@@ -1955,23 +1739,14 @@ define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_shuffle_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i2
@@ -1984,23 +1759,14 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub
define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $3, %al
-; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $3, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT: retq
%trn1 = trunc i8 %a0 to i2
@@ -2027,23 +1793,14 @@ define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -2056,23 +1813,14 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d
define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_pd:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT: retq
%trn1 = trunc i8 %a0 to i4
@@ -2099,23 +1847,14 @@ define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_shuffle_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT: retq
%trn1 = trunc i8 %a1 to i4
@@ -2128,23 +1867,14 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float>
define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $15, %al
-; X32-NEXT: movb %al, (%esp)
-; X32-NEXT: movzbl (%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
-; X32-NEXT: popl %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64: # %bb.0:
-; X64-NEXT: andb $15, %dil
-; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
%trn0 = trunc i8 %a0 to i4
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 5f4b050b863d1..ea1ff4e56b959 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -22,7 +22,6 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -59,7 +58,6 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -96,7 +94,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -104,7 +101,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -142,7 +138,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -150,7 +145,6 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -193,7 +187,6 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -235,7 +228,6 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -275,7 +267,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -285,7 +276,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -326,7 +316,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -336,7 +325,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -380,11 +368,9 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -427,11 +413,9 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -587,7 +571,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -614,7 +597,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -643,7 +625,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -674,7 +655,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -712,7 +692,6 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -776,7 +755,6 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -842,7 +820,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -910,7 +887,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -979,7 +955,6 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -1048,7 +1023,6 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -1119,7 +1093,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -1192,7 +1165,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -1271,7 +1243,6 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1311,7 +1282,6 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1349,7 +1319,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1357,7 +1326,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1396,7 +1364,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1404,7 +1371,6 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -1450,7 +1416,6 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1495,7 +1460,6 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1536,7 +1500,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1546,7 +1509,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1588,7 +1550,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -1598,7 +1559,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -1814,13 +1774,11 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -1949,14 +1907,12 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -3117,11 +3073,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -3158,11 +3112,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -3214,7 +3166,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3272,7 +3223,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3319,11 +3269,9 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -3377,7 +3325,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3427,11 +3374,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -3474,11 +3419,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -3536,7 +3479,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3600,7 +3542,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -3653,11 +3594,9 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -3717,7 +3656,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -4802,7 +4740,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4842,7 +4779,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4885,7 +4821,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4929,7 +4864,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -4973,7 +4907,6 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -5017,7 +4950,6 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -5064,7 +4996,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5109,7 +5040,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5157,7 +5087,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5206,7 +5135,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5255,7 +5183,6 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5304,7 +5231,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -5333,8 +5259,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
@@ -5361,8 +5286,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
@@ -5391,8 +5315,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
@@ -5430,8 +5353,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
@@ -5470,8 +5392,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b)
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
@@ -5502,8 +5423,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
@@ -6090,11 +6010,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -6131,11 +6049,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -6181,7 +6097,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6233,7 +6148,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6280,11 +6194,9 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -6332,7 +6244,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6382,11 +6293,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -6429,11 +6338,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -6485,7 +6392,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6543,7 +6449,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -6596,11 +6501,9 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -6654,7 +6557,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7476,7 +7378,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7519,7 +7420,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7577,7 +7477,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7637,7 +7536,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7686,7 +7584,6 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7746,7 +7643,6 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7798,7 +7694,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7847,7 +7742,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7911,7 +7805,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -7977,7 +7870,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -8032,7 +7924,6 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -8098,7 +7989,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -9132,7 +9022,6 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -9169,7 +9058,6 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -9206,7 +9094,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -9214,7 +9101,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -9252,7 +9138,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -9260,7 +9145,6 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -9303,7 +9187,6 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -9345,7 +9228,6 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -9385,7 +9267,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -9395,7 +9276,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -9436,7 +9316,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -9446,7 +9325,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -9490,11 +9368,9 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -9537,11 +9413,9 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -9697,7 +9571,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -9724,7 +9597,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -9753,7 +9625,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -9784,7 +9655,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -9822,7 +9692,6 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -9886,7 +9755,6 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -9952,7 +9820,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -10020,7 +9887,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -10089,7 +9955,6 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -10158,7 +10023,6 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -10229,7 +10093,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -10302,7 +10165,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -10381,7 +10243,6 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -10421,7 +10282,6 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -10459,7 +10319,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -10467,7 +10326,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -10506,7 +10364,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -10514,7 +10371,6 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -10560,7 +10416,6 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -10605,7 +10460,6 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -10646,7 +10500,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -10656,7 +10509,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -10698,7 +10550,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -10708,7 +10559,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -10924,13 +10774,11 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -11059,14 +10907,12 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -12227,11 +12073,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -12268,11 +12112,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -12324,7 +12166,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -12382,7 +12223,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -12429,11 +12269,9 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -12487,7 +12325,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -12537,11 +12374,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -12584,11 +12419,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -12646,7 +12479,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -12710,7 +12542,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -12763,11 +12594,9 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -12827,7 +12656,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -13912,7 +13740,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -13952,7 +13779,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -13995,7 +13821,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -14039,7 +13864,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -14083,7 +13907,6 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -14127,7 +13950,6 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -14174,7 +13996,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -14219,7 +14040,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -14267,7 +14087,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -14316,7 +14135,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -14365,7 +14183,6 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -14414,7 +14231,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -14443,8 +14259,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
@@ -14471,8 +14286,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
@@ -14501,8 +14315,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
@@ -14540,8 +14353,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
@@ -14580,8 +14392,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
@@ -14612,8 +14423,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
@@ -15200,11 +15010,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -15241,11 +15049,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -15291,7 +15097,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -15343,7 +15148,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -15390,11 +15194,9 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -15442,7 +15244,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -15492,11 +15293,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -15539,11 +15338,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -15595,7 +15392,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -15653,7 +15449,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -15706,11 +15501,9 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -15764,7 +15557,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16586,7 +16378,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16629,7 +16420,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16687,7 +16477,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16747,7 +16536,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16796,7 +16584,6 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16856,7 +16643,6 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16908,7 +16694,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -16957,7 +16742,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -17021,7 +16805,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -17087,7 +16870,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -17142,7 +16924,6 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -17208,7 +16989,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -18331,7 +18111,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -18380,7 +18159,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -18522,7 +18300,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -18576,7 +18353,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -19644,7 +19420,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -19694,7 +19469,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -19843,7 +19617,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -19898,7 +19671,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -21451,7 +21223,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21495,7 +21266,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21551,7 +21321,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21610,7 +21379,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21659,7 +21427,6 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21717,7 +21484,6 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21769,7 +21535,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21819,7 +21584,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21881,7 +21645,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -21946,7 +21709,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -22001,7 +21763,6 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -22065,7 +21826,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -23150,7 +22910,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -23190,7 +22949,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -23233,7 +22991,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -23277,7 +23034,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -23321,7 +23077,6 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -23365,7 +23120,6 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -23412,7 +23166,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23457,7 +23210,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23505,7 +23257,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23554,7 +23305,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23603,7 +23353,6 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23652,7 +23401,6 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -23681,8 +23429,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
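
The VLX hunks in this stretch make the other of the two simplifications: kmovb can move a mask register directly into a 32-bit GPR, so the old spill-and-reload pair (kmovb to a stack slot followed by movzbl) collapses into a single kmovb %k0, %eax. A sketch of the v2i1-to-i4 shape these tests share (hypothetical function, mirroring but not copied from the diff):

; With AVX512VL/DQ the 2-bit compare mask goes straight from %k0 to %eax.
define zeroext i4 @sketch_vpcmpsgeq_v2i1_mask(<2 x i64> %a, <2 x i64> %b) {
entry:
  %cmp = icmp sge <2 x i64> %a, %b
  %mask = bitcast <2 x i1> %cmp to i2
  %ext = zext i2 %mask to i4
  ret i4 %ext
}
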
@@ -23711,8 +23458,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
@@ -23744,8 +23490,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
@@ -23783,8 +23528,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
@@ -23824,8 +23568,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
@@ -23858,8 +23601,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
@@ -24464,7 +24206,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24508,7 +24249,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24558,7 +24298,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24611,7 +24350,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24660,7 +24398,6 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24712,7 +24449,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24764,7 +24500,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24814,7 +24549,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24870,7 +24604,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24929,7 +24662,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -24984,7 +24716,6 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -25042,7 +24773,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -25894,7 +25624,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -25940,7 +25669,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26000,7 +25728,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26063,7 +25790,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26114,7 +25840,6 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26176,7 +25901,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26230,7 +25954,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26282,7 +26005,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26348,7 +26070,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26417,7 +26138,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26474,7 +26194,6 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -26542,7 +26261,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -27579,7 +27297,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
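
The vpcmpult* blocks below show the standard lowering of unsigned compares on targets without native unsigned vector compares: each operand is XORed with a sign-bit splat (the vpxor against a constant register or the memory operand) so that the signed vpcmpgtb computes the unsigned ordering; the hunks themselves still only drop the redundant vpslld $31. A minimal IR sketch of what is being compiled (illustrative; not a function from this diff):

; ult(a, b) is lowered on NoVLX as sgt(b ^ 0x80..., a ^ 0x80...).
define zeroext i32 @sketch_vpcmpultb_mask(<16 x i8> %a, <16 x i8> %b) {
entry:
  %cmp = icmp ult <16 x i8> %a, %b
  %mask = bitcast <16 x i1> %cmp to i16
  %ext = zext i16 %mask to i32
  ret i32 %ext
}
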
@@ -27619,7 +27336,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -27659,7 +27375,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27667,7 +27382,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -27708,7 +27422,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27716,7 +27429,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -27762,7 +27474,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27807,7 +27518,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27850,7 +27560,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27860,7 +27569,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27904,7 +27612,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -27914,7 +27621,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -27961,11 +27667,9 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -28011,11 +27715,9 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -28180,7 +27882,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
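
The word-element tests apply the same cleanup at quadword granularity: vpmovsxwq already replicates each word's sign bit across all 64 bits of the lane, so the vpsllq $63 ahead of vptestmq is dropped. A sketch of the v8i1 shape (hypothetical example, names invented):

; 8 words compared, mask zero-extended to i16; no shift needed before vptestmq.
define zeroext i16 @sketch_vpcmpultw_mask(<8 x i16> %a, <8 x i16> %b) {
entry:
  %cmp = icmp ult <8 x i16> %a, %b
  %mask = bitcast <8 x i1> %cmp to i8
  %ext = zext i8 %mask to i16
  ret i16 %ext
}
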
@@ -28210,7 +27911,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -28242,7 +27942,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -28276,7 +27975,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -28317,7 +28015,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28384,7 +28081,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28453,7 +28149,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28524,7 +28219,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28596,7 +28290,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28668,7 +28361,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28742,7 +28434,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28818,7 +28509,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftrw $7, %k0, %k1
@@ -28900,7 +28590,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -28943,7 +28632,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -28984,7 +28672,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -28992,7 +28679,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -29034,7 +28720,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -29042,7 +28727,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -29091,7 +28775,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29139,7 +28822,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29183,7 +28865,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -29193,7 +28874,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29238,7 +28918,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k0
@@ -29248,7 +28927,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -29466,7 +29144,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
@@ -29475,7 +29152,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -29607,7 +29283,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -29616,7 +29291,6 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
@@ -30826,11 +30500,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -30870,11 +30542,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -30929,7 +30599,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -30990,7 +30659,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31040,11 +30708,9 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -31101,7 +30767,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31154,11 +30819,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -31204,11 +30867,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -31269,7 +30930,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31336,7 +30996,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -31392,11 +31051,9 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -31459,7 +31116,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -32544,7 +32200,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32584,7 +32239,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32627,7 +32281,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32671,7 +32324,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32715,7 +32367,6 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32759,7 +32410,6 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -32806,7 +32456,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32851,7 +32500,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32899,7 +32547,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32948,7 +32595,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -32997,7 +32643,6 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -33046,7 +32691,6 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -33075,8 +32719,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
@@ -33106,8 +32749,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
@@ -33139,8 +32781,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
@@ -33181,8 +32822,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
@@ -33224,8 +32864,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
@@ -33259,8 +32898,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
@@ -33889,11 +33527,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -33933,11 +33569,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -33986,7 +33620,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34041,7 +33674,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34091,11 +33723,9 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -34146,7 +33776,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34199,11 +33828,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -34249,11 +33876,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -34308,7 +33933,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34369,7 +33993,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -34425,11 +34048,9 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -34486,7 +34107,6 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35347,7 +34967,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35393,7 +35012,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35454,7 +35072,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35517,7 +35134,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35569,7 +35185,6 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35632,7 +35247,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35687,7 +35301,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35739,7 +35352,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35806,7 +35418,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35875,7 +35486,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -35933,7 +35543,6 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -36002,7 +35611,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -37177,8 +36785,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -37186,9 +36793,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i6
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
@@ -37238,8 +36843,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -37247,9 +36851,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -37300,8 +36902,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -37309,9 +36910,7 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -37523,8 +37122,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -37532,9 +37130,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
@@ -37584,8 +37180,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -37593,9 +37188,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -37646,8 +37239,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -37655,9 +37247,7 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u,
;
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -37729,11 +37319,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -37770,11 +37358,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -37813,11 +37399,9 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -37839,8 +37423,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -37853,24 +37436,21 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -37889,8 +37469,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -37903,24 +37482,21 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -37940,8 +37516,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -37954,10 +37529,8 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -37965,14 +37538,13 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -38015,11 +37587,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -38062,11 +37632,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -38111,11 +37679,9 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -38140,8 +37706,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -38154,10 +37719,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
@@ -38167,16 +37730,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -38196,8 +37758,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -38210,10 +37771,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
@@ -38223,16 +37782,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -38253,8 +37811,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -38267,10 +37824,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
@@ -38281,16 +37836,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -39366,7 +38920,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39406,7 +38959,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39447,7 +38999,6 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39491,7 +39042,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39535,7 +39085,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39580,7 +39129,6 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -39669,7 +39217,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39714,7 +39261,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39760,7 +39306,6 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39809,7 +39354,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39858,7 +39402,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39908,7 +39451,6 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -39991,8 +39533,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
@@ -40019,8 +39560,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
@@ -40048,8 +39588,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
@@ -40078,18 +39617,14 @@ entry:
define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -40115,18 +39650,14 @@ entry:
define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -40153,18 +39684,14 @@ entry:
define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
-; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40310,8 +39837,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -40319,9 +39845,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i6
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -40357,8 +39881,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -40366,9 +39889,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -40405,8 +39926,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -40414,9 +39934,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40572,8 +40090,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -40581,9 +40098,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -40619,8 +40134,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -40628,9 +40142,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -40667,8 +40179,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -40676,9 +40187,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u,
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40736,11 +40245,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -40777,11 +40284,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -40820,11 +40325,9 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
@@ -40846,8 +40349,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -40860,24 +40362,21 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -40896,8 +40395,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -40910,24 +40408,21 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -40947,8 +40442,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
@@ -40961,10 +40455,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -40972,14 +40464,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -41022,11 +40513,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -41069,11 +40558,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -41118,11 +40605,9 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
@@ -41147,8 +40632,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -41161,10 +40645,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
@@ -41174,16 +40656,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -41203,8 +40684,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -41217,10 +40697,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -41230,16 +40708,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -41260,8 +40737,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
@@ -41274,10 +40750,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
@@ -41288,16 +40762,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -41487,8 +40960,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -41497,9 +40969,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i6
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41550,8 +41020,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -41560,9 +41029,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41614,8 +41081,7 @@ entry:
define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %al killed %al killed %eax
@@ -41624,9 +41090,7 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
@@ -41848,8 +41312,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -41858,9 +41321,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41911,8 +41372,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -41921,9 +41381,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -41975,8 +41433,7 @@ entry:
define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: # kill: def %ax killed %ax killed %eax
@@ -41985,9 +41442,7 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u,
;
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
@@ -42062,7 +41517,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42105,7 +41559,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42150,7 +41603,6 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42176,8 +41628,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
@@ -42191,10 +41642,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -42202,14 +41651,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -42228,8 +41676,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
@@ -42243,10 +41690,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -42254,14 +41699,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -42281,8 +41725,7 @@ entry:
define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
@@ -42296,10 +41739,8 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
@@ -42308,14 +41749,13 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: vzeroupper
@@ -42360,7 +41800,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42409,7 +41848,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42460,7 +41898,6 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -42489,8 +41926,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
@@ -42504,10 +41940,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -42518,16 +41952,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -42547,8 +41980,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
@@ -42562,10 +41994,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
@@ -42576,16 +42006,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -42606,8 +42035,7 @@ entry:
define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
; VLX: # %bb.0: # %entry
-; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
@@ -42621,10 +42049,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $96, %rsp
-; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
@@ -42636,16 +42062,15 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
-; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
@@ -43830,7 +43255,6 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 78c44e4dca3b3..45af265a95b05 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -31,11 +31,9 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
@@ -80,8 +78,7 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i32:
@@ -89,8 +86,7 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i32> %a, %b
%x1 = icmp sgt <4 x i32> %c, %d
@@ -123,8 +119,7 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k1
; AVX512F-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4f32:
@@ -132,8 +127,7 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k1
; AVX512BW-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = fcmp ogt <4 x float> %a, %b
%x1 = fcmp ogt <4 x float> %c, %d
@@ -165,11 +159,9 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
@@ -318,8 +310,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i8:
@@ -335,8 +326,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i8> %a, %b
%x1 = icmp sgt <2 x i8> %c, %d
@@ -473,8 +463,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i16:
@@ -490,8 +479,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i16> %a, %b
%x1 = icmp sgt <2 x i16> %c, %d
@@ -612,8 +600,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i32:
@@ -629,8 +616,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i32> %a, %b
%x1 = icmp sgt <2 x i32> %c, %d
@@ -682,8 +668,7 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i64:
@@ -691,8 +676,7 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i64> %a, %b
%x1 = icmp sgt <2 x i64> %c, %d
@@ -725,8 +709,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; AVX512F-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2f64:
@@ -734,8 +717,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; AVX512BW-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = fcmp ogt <2 x double> %a, %b
%x1 = fcmp ogt <2 x double> %c, %d
@@ -792,8 +774,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i8:
@@ -809,8 +790,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i8> %a, %b
%x1 = icmp sgt <4 x i8> %c, %d
@@ -867,8 +847,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i16:
@@ -884,8 +863,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i16> %a, %b
%x1 = icmp sgt <4 x i16> %c, %d
@@ -944,10 +922,8 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-NEXT: vpmovsxwd %xmm2, %ymm0
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index fdce65516e322..62480bb0bd25c 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -94,8 +94,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512F-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -104,8 +103,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512BW-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i64> %a, %b
@@ -148,8 +146,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512F-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -158,8 +155,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512BW-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x0 = fcmp ogt <4 x double> %a, %b
@@ -219,11 +215,9 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 6ef2be99dee52..04cbded7667cd 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -44,10 +44,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
;
; AVX512-LABEL: ext_i2_2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: andb $3, %dil
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -86,10 +83,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
;
; AVX512-LABEL: ext_i4_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: andb $15, %dil
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -102,8 +96,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -112,8 +106,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX1-LABEL: ext_i8_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -239,10 +233,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
;
; AVX512-LABEL: ext_i4_4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: andb $15, %dil
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -305,8 +296,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; SSE2-SSSE3-LABEL: ext_i16_16i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
@@ -319,8 +310,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1-LABEL: ext_i16_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -565,8 +556,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-LABEL: ext_i32_32i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm2
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
@@ -574,8 +565,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768]
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
@@ -586,8 +577,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-LABEL: ext_i32_32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -599,8 +590,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 9e77cd11449e6..54ba1881f1151 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -49,9 +49,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
; AVX512F-LABEL: ext_i2_2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: andb $3, %dil
-; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
@@ -59,10 +57,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
;
; AVX512VLBW-LABEL: ext_i2_2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: andb $3, %dil
-; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: kmovd %edi, %k1
; AVX512VLBW-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%1 = bitcast i2 %a0 to <2 x i1>
@@ -104,9 +99,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
; AVX512F-LABEL: ext_i4_4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: andb $15, %dil
-; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
@@ -114,10 +107,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
;
; AVX512VLBW-LABEL: ext_i4_4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: andb $15, %dil
-; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: kmovd %edi, %k1
; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
@@ -129,8 +119,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -140,8 +130,8 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX1-LABEL: ext_i8_8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -300,19 +290,14 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX512F-LABEL: ext_i4_4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: andb $15, %dil
-; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i4_4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: andb $15, %dil
-; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: kmovd %edi, %k1
; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
@@ -385,8 +370,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; SSE2-SSSE3-LABEL: ext_i16_16i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
@@ -401,8 +386,8 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1-LABEL: ext_i16_16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -723,8 +708,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-LABEL: ext_i32_32i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm2
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
@@ -734,8 +719,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
; SSE2-SSSE3-NEXT: psrlw $15, %xmm1
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
@@ -748,8 +733,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-LABEL: ext_i32_32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -763,8 +748,8 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm2
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 45a48fae146d4..8af95dfd5b80e 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -43,9 +43,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
;
; AVX512-LABEL: bitcast_i2_2i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -86,9 +84,7 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) {
;
; AVX512-LABEL: bitcast_i4_4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -100,8 +96,8 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
; SSE2-SSSE3-LABEL: bitcast_i8_8i1:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movd %edi, %xmm0
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
@@ -111,8 +107,8 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
; AVX1-LABEL: bitcast_i8_8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
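Both shuffle pairs above broadcast word 0 across all eight 16-bit lanes; the update only changes which of the two instructions carries the duplication (pshuflw now copies word 0 into word 1, and pshufd then splats dword 0). A minimal C model of pshuflw/pshufd — helper names are illustrative, not from this commit — shows the new sequence producing the splat:

#include <stdint.h>
#include <stdio.h>

/* pshuflw permutes the low four 16-bit words by a 2-bit selector each;
   pshufd permutes the four 32-bit dwords (modeled here as word pairs). */
static void pshuflw(uint16_t w[8], const int s[4]) {
    uint16_t t[4] = {w[0], w[1], w[2], w[3]};
    for (int i = 0; i < 4; i++) w[i] = t[s[i]];
}
static void pshufd(uint16_t w[8], const int s[4]) {
    uint16_t t[8];
    for (int i = 0; i < 8; i++) t[i] = w[i];
    for (int i = 0; i < 4; i++) { w[2*i] = t[2*s[i]]; w[2*i+1] = t[2*s[i]+1]; }
}

int main(void) {
    uint16_t a[8] = {7, 1, 2, 3, 4, 5, 6, 8}; /* word 0 holds the value */
    pshuflw(a, (const int[]){0, 0, 2, 3});    /* duplicate word 0 into word 1 */
    pshufd(a, (const int[]){0, 0, 0, 0});     /* splat dword 0 everywhere */
    for (int i = 0; i < 8; i++) printf("%u ", a[i]); /* prints 7 eight times */
    printf("\n");
    return 0;
}

The old pair ([0,0,0,0] then [0,0,1,1]) yields the same eight-way splat; the new form simply needs only one distinct dword after the first shuffle.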
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 8fdacb7b79d61..a96c1a30e67a5 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -27,7 +27,6 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
@@ -64,16 +63,14 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i32> %a, %b
%res = bitcast <4 x i1> %x to i4
@@ -99,16 +96,14 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = fcmp ogt <4 x float> %a, %b
%res = bitcast <4 x i1> %x to i4
@@ -134,7 +129,6 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
@@ -226,8 +220,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i8:
@@ -238,8 +231,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i8> %a, %b
%res = bitcast <2 x i1> %x to i2
@@ -320,8 +312,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i16:
@@ -332,8 +323,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i16> %a, %b
%res = bitcast <2 x i1> %x to i2
@@ -406,8 +396,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i32:
@@ -418,8 +407,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i32> %a, %b
%res = bitcast <2 x i1> %x to i2
@@ -455,16 +443,14 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i64> %a, %b
%res = bitcast <2 x i1> %x to i2
@@ -490,16 +476,14 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2f64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = fcmp ogt <2 x double> %a, %b
%res = bitcast <2 x i1> %x to i2
@@ -537,8 +521,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i8:
@@ -549,8 +532,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i8> %a, %b
%res = bitcast <4 x i1> %x to i4
@@ -588,8 +570,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i16:
@@ -600,8 +581,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i16> %a, %b
%res = bitcast <4 x i1> %x to i4
@@ -641,7 +621,6 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
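The store/reload pairs deleted throughout this file only truncated %eax to %al through the stack; the '# kill' line that replaces them is a liveness annotation, not an instruction, because %al already aliases the low byte of %eax. A one-line C illustration (assuming any x86-64 compiler) where the same truncation costs zero instructions:

#include <stdint.h>

/* i32 -> i8 truncation is free on x86-64: %al is the low byte of %eax,
   so no store to the stack and reload is ever required. */
uint8_t low_byte(uint32_t mask) { return (uint8_t)mask; }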
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index 48e28c9d26ca4..0398f31f12d48 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -42,7 +42,6 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
@@ -194,11 +193,9 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %eax
@@ -271,8 +268,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -280,8 +276,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i64> %a, %b
@@ -311,8 +306,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -320,8 +314,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x = fcmp ogt <4 x double> %a, %b
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
index 9914f0b934341..f752068acdf00 100644
--- a/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -62,14 +62,12 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %eax
@@ -870,21 +868,17 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl (%rsp), %ecx
diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll
index 8548d8b7677d2..428eaa19497b6 100644
--- a/test/CodeGen/X86/broadcastm-lowering.ll
+++ b/test/CodeGen/X86/broadcastm-lowering.ll
@@ -8,7 +8,6 @@ define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) {
; AVX512CD: # %bb.0: # %entry
; AVX512CD-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512CD-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512CD-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512CD-NEXT: kmovw %k0, %eax
; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -45,7 +44,6 @@ define <4 x i32> @test_mm_epi32(<16 x i8> %a, <16 x i8> %b) {
; AVX512CD: # %bb.0: # %entry
; AVX512CD-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512CD-NEXT: kmovw %k0, %eax
; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -179,7 +177,6 @@ define <8 x i32> @test_mm256_epi32(<16 x i16> %a, <16 x i16> %b) {
; AVX512CD: # %bb.0: # %entry
; AVX512CD-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512CD-NEXT: kmovw %k0, %eax
; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
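Each vpslld/vpsllq deleted above fed vptestm with lanes that were already all-ones or all-zeros, since vpmovsx* of a compare result sign-extends the lane mask; shifting such a lane left by 31 cannot change whether it is nonzero, which is all vptestm inspects. A small C check of that invariant (illustrative only):

#include <stdint.h>
#include <assert.h>

int main(void) {
    int32_t lanes[2] = {0, -1}; /* sign-extended i1 compare results */
    for (int i = 0; i < 2; i++) {
        int32_t shifted = (int32_t)((uint32_t)lanes[i] << 31);
        assert((lanes[i] != 0) == (shifted != 0)); /* same vptestmd verdict */
    }
    return 0;
}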
diff --git a/test/CodeGen/X86/build-vector-128.ll b/test/CodeGen/X86/build-vector-128.ll
index da92fe6c3fdae..6c0c2d30c312d 100644
--- a/test/CodeGen/X86/build-vector-128.ll
+++ b/test/CodeGen/X86/build-vector-128.ll
@@ -409,3 +409,101 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
ret <16 x i8> %ins15
}
+
+; PR30780
+
+define <4 x i32> @test_buildvector_v4i32_splat_sext_i8(i8 %in) {
+; SSE-32-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; SSE-32: # %bb.0:
+; SSE-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; SSE-32-NEXT: movd %eax, %xmm0
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-32-NEXT: retl
+;
+; SSE-64-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; SSE-64: # %bb.0:
+; SSE-64-NEXT: movsbl %dil, %eax
+; SSE-64-NEXT: movd %eax, %xmm0
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-64-NEXT: retq
+;
+; AVX1-32-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movsbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v4i32_splat_sext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movsbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-64-NEXT: retq
+ %ext = sext i8 %in to i32
+ %insert = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %splat
+}
+
+define <4 x i32> @test_buildvector_v4i32_splat_zext_i8(i8 %in) {
+; SSE-32-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; SSE-32: # %bb.0:
+; SSE-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; SSE-32-NEXT: movd %eax, %xmm0
+; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-32-NEXT: retl
+;
+; SSE-64-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; SSE-64: # %bb.0:
+; SSE-64-NEXT: movzbl %dil, %eax
+; SSE-64-NEXT: movd %eax, %xmm0
+; SSE-64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-64-NEXT: retq
+;
+; AVX1-32-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movzbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v4i32_splat_zext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movzbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-64-NEXT: retq
+ %ext = zext i8 %in to i32
+ %insert = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %splat
+}
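For reference, the PR30780 IR pattern above is the ordinary set1-of-an-extended-byte idiom; a hedged intrinsics sketch (function names are illustrative) that lowers to exactly the movsbl/movzbl plus pshufd or vpbroadcastd sequences checked above:

#include <immintrin.h>
#include <stdint.h>

/* Sign-extend an i8, then splat it across four i32 lanes. */
__m128i splat_sext_i8(int8_t in) {
    return _mm_set1_epi32((int32_t)in);
}
/* Zero-extend an i8, then splat it across four i32 lanes. */
__m128i splat_zext_i8(uint8_t in) {
    return _mm_set1_epi32((int32_t)(uint32_t)in);
}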
diff --git a/test/CodeGen/X86/build-vector-256.ll b/test/CodeGen/X86/build-vector-256.ll
index f2f17710033d7..d2d7a194c7012 100644
--- a/test/CodeGen/X86/build-vector-256.ll
+++ b/test/CodeGen/X86/build-vector-256.ll
@@ -411,3 +411,77 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%ins31 = insertelement <32 x i8> %ins30, i8 %a31, i32 31
ret <32 x i8> %ins31
}
+
+; PR30780
+
+define <8 x i32> @test_buildvector_v8i32_splat_sext_i8(i8 %in) {
+; AVX1-32-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movsbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v8i32_splat_sext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movsbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-64-NEXT: retq
+ %ext = sext i8 %in to i32
+ %insert = insertelement <8 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %splat
+}
+
+define <8 x i32> @test_buildvector_v8i32_splat_zext_i8(i8 %in) {
+; AVX1-32-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: vmovd %eax, %xmm0
+; AVX1-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
+;
+; AVX1-64-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: movzbl %dil, %eax
+; AVX1-64-NEXT: vmovd %eax, %xmm0
+; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: vmovd %eax, %xmm0
+; AVX2-32-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_v8i32_splat_zext_i8:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: movzbl %dil, %eax
+; AVX2-64-NEXT: vmovd %eax, %xmm0
+; AVX2-64-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-64-NEXT: retq
+ %ext = zext i8 %in to i32
+ %insert = insertelement <8 x i32> undef, i32 %ext, i32 0
+ %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %splat
+}
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index ee63ec6539188..ff41083835f4f 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -409,11 +409,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE2-LABEL: example24:
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movd %esi, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB6_1: # %vector.body
@@ -441,11 +441,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE41-LABEL: example24:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE41-NEXT: movd %esi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB6_1: # %vector.body
@@ -470,11 +470,11 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX1-LABEL: example24:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovd %esi, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
index 556c858759fc7..afb9aa9411ed6 100644
--- a/test/CodeGen/X86/cvtv2f32.ll
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -5,8 +5,8 @@
; uitofp <2 x i32> codegen from buildvector or legalization is different but gives the same results
; across the full 0 - 0xFFFFFFFF u32 range.
-define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
-; X32-LABEL: uitofp_2i32_buildvector:
+define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) {
+; X32-LABEL: uitofp_2i32_cvt_buildvector:
; X32: # %bb.0:
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
@@ -18,7 +18,7 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
; X32-NEXT: mulps %xmm1, %xmm0
; X32-NEXT: retl
;
-; X64-LABEL: uitofp_2i32_buildvector:
+; X64-LABEL: uitofp_2i32_cvt_buildvector:
; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm1
; X64-NEXT: pinsrd $1, %esi, %xmm1
@@ -38,6 +38,37 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
ret <2 x float> %t5
}
+define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) {
+; X32-LABEL: uitofp_2i32_buildvector_cvt:
+; X32: # %bb.0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X32-NEXT: movapd {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15]
+; X32-NEXT: orpd %xmm1, %xmm2
+; X32-NEXT: subpd %xmm1, %xmm2
+; X32-NEXT: cvtpd2ps %xmm2, %xmm1
+; X32-NEXT: mulps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: uitofp_2i32_buildvector_cvt:
+; X64: # %bb.0:
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: movd %edi, %xmm2
+; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [4.503600e+15,4.503600e+15]
+; X64-NEXT: por %xmm1, %xmm2
+; X64-NEXT: subpd %xmm1, %xmm2
+; X64-NEXT: cvtpd2ps %xmm2, %xmm1
+; X64-NEXT: mulps %xmm1, %xmm0
+; X64-NEXT: retq
+ %t1 = insertelement <2 x i32> undef, i32 %x, i32 0
+ %t2 = insertelement <2 x i32> %t1, i32 %y, i32 1
+ %t3 = uitofp <2 x i32> %t2 to <2 x float>
+ %t4 = fmul <2 x float> %v, %t3
+ ret <2 x float> %t4
+}
+
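The magic constant in the new test is 2^52 (printed as 4.503600e+15): OR-ing a u32 into the mantissa of 2^52 produces the double 2^52 + x, and subtracting 2^52 recovers (double)x exactly before the final cvtpd2ps. A scalar C sketch of the same trick — the helper name is hypothetical:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static double u32_to_double_bias(uint32_t x) {
    uint64_t bits = 0x4330000000000000ULL | (uint64_t)x; /* bits of 2^52, OR in x */
    double d;
    memcpy(&d, &bits, sizeof d);    /* reinterpret, like punpcklqdq + por */
    return d - 4503599627370496.0;  /* subtract 2^52, leaving (double)x */
}

int main(void) {
    printf("%f\n", u32_to_double_bias(0xFFFFFFFFu)); /* 4294967295.000000 */
    return 0;
}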
define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
; X32-LABEL: uitofp_2i32_legalized:
; X32: # %bb.0:
diff --git a/test/CodeGen/X86/fixup-bw-inst.mir b/test/CodeGen/X86/fixup-bw-inst.mir
index cea483e1b9bc6..e5a5e16108fb5 100644
--- a/test/CodeGen/X86/fixup-bw-inst.mir
+++ b/test/CodeGen/X86/fixup-bw-inst.mir
@@ -26,6 +26,12 @@
ret i16 %i.0
}
+ define i16 @test4() {
+ entry:
+ %t1 = zext i1 undef to i16
+ %t2 = or i16 undef, %t1
+ ret i16 %t2
+ }
...
---
# CHECK-LABEL: name: test1
@@ -149,3 +155,47 @@ body: |
RETQ %ax
...
+---
+# CHECK-LABEL: name: test4
+name: test4
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%r9d' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+# This code copies r10b into r9b and then uses r9w. We would like to promote
+# the copy to a 32-bit copy, but because r9w is used this is not acceptable.
+body: |
+ bb.0.entry:
+ successors:
+ liveins: %r9d
+
+ %r9b = MOV8rr undef %r10b, implicit-def %r9d, implicit killed %r9d, implicit-def %eflags
+ ; CHECK-NOT: MOV32rr
+ %ax = OR16rr undef %ax, %r9w, implicit-def %eflags
+ RETQ %ax
+...
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index 1c97e8c768ccd..cc434bf18ab31 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -206,8 +206,8 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
@@ -245,8 +245,8 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
@@ -292,8 +292,8 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
@@ -351,8 +351,8 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
@@ -400,8 +400,8 @@ define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index df97973aecbd2..50f44419e8230 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -695,7 +695,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rdi)
diff --git a/test/CodeGen/X86/pr33349.ll b/test/CodeGen/X86/pr33349.ll
index b1428ba6667c8..8f9c861d9ecf3 100644
--- a/test/CodeGen/X86/pr33349.ll
+++ b/test/CodeGen/X86/pr33349.ll
@@ -40,7 +40,7 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX: # %bb.0: # %bb
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kshiftrw $1, %k1, %k2
; SKX-NEXT: kmovd %k2, %eax
; SKX-NEXT: testb $1, %al
diff --git a/test/CodeGen/X86/pr35765.ll b/test/CodeGen/X86/pr35765.ll
new file mode 100644
index 0000000000000..4d097459e33ac
--- /dev/null
+++ b/test/CodeGen/X86/pr35765.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
+
+@ll = local_unnamed_addr global i64 0, align 8
+@x = local_unnamed_addr global i64 2651237805702985558, align 8
+@s1 = local_unnamed_addr global { i8, i8 } { i8 123, i8 5 }, align 2
+@s2 = local_unnamed_addr global { i8, i8 } { i8 -122, i8 3 }, align 2
+
+define void @PR35765() {
+; CHECK-LABEL: PR35765:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx
+; CHECK-NEXT: addl $-1398, %ecx # imm = 0xFA8A
+; CHECK-NEXT: movl $4, %eax
+; CHECK-NEXT: # kill: def %cl killed %cl killed %ecx
+; CHECK-NEXT: shll %cl, %eax
+; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx
+; CHECK-NEXT: movzwl {{.*}}(%rip), %edx
+; CHECK-NEXT: notl %edx
+; CHECK-NEXT: orl $63488, %edx # imm = 0xF800
+; CHECK-NEXT: movzwl %dx, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: xorl %eax, %edx
+; CHECK-NEXT: movslq %edx, %rax
+; CHECK-NEXT: movq %rax, {{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+ %bf.load.i = load i16, i16* bitcast ({ i8, i8 }* @s1 to i16*), align 2
+ %bf.clear.i = and i16 %bf.load.i, 2047
+ %conv.i = zext i16 %bf.clear.i to i32
+ %sub.i = add nsw i32 %conv.i, -1398
+ %shl.i = shl i32 4, %sub.i
+ %0 = load i64, i64* @x, align 8
+ %bf.load1.i = load i16, i16* bitcast ({ i8, i8 }* @s2 to i16*), align 2
+ %bf.clear2.i = and i16 %bf.load1.i, 2047
+ %1 = xor i16 %bf.clear2.i, -1
+ %neg.i = zext i16 %1 to i64
+ %or.i = or i64 %0, %neg.i
+ %conv5.i = trunc i64 %or.i to i32
+ %conv6.i = and i32 %conv5.i, 65535
+ %xor.i = xor i32 %conv6.i, %shl.i
+ %conv7.i = sext i32 %xor.i to i64
+ store i64 %conv7.i, i64* @ll, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index 8642bc596f39b..490c232a161c8 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -54,16 +54,16 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -137,8 +137,8 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -267,8 +267,8 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
@@ -277,8 +277,8 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovd %edi, %xmm2
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
@@ -392,8 +392,8 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: psubusb %xmm2, %xmm0
; SSE2-NEXT: psubusb %xmm2, %xmm1
; SSE2-NEXT: retq
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index 410378ffbad29..1163307a0c34b 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -1,6 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX256 --check-prefix=AVX512BW
; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization.
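The PMOVMSK idiom named in that comment reduces a 128-bit equality to one scalar compare: compare all 16 bytes at once, collect the byte sign bits into a GPR, and test for 0xFFFF. A minimal SSE2 intrinsics sketch (illustrative, not from this commit):

#include <immintrin.h>

/* Returns 1 iff all 128 bits of x and y are equal. */
int eq128(__m128i x, __m128i y) {
    return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) == 0xFFFF;
}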
@@ -14,14 +17,14 @@ define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: ne_i128:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: setne %al
+; AVXANY-NEXT: retq
%bcx = bitcast <2 x i64> %x to i128
%bcy = bitcast <2 x i64> %y to i128
%cmp = icmp ne i128 %bcx, %bcy
@@ -39,14 +42,14 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: eq_i128:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: sete %al
+; AVXANY-NEXT: retq
%bcx = bitcast <2 x i64> %x to i128
%bcy = bitcast <2 x i64> %y to i128
%cmp = icmp eq i128 %bcx, %bcy
@@ -80,15 +83,39 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i256:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmovmskb %ymm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $-1, %ecx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX1-LABEL: ne_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovq %xmm2, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm2, %r8
+; AVX1-NEXT: vmovq %xmm1, %rdi
+; AVX1-NEXT: xorq %rax, %rdi
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rsi
+; AVX1-NEXT: xorq %rcx, %rsi
+; AVX1-NEXT: orq %rdi, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: xorq %rdx, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: xorq %r8, %rcx
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: ne_i256:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: setne %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%bcx = bitcast <4 x i64> %x to i256
%bcy = bitcast <4 x i64> %y to i256
%cmp = icmp ne i256 %bcx, %bcy
@@ -122,15 +149,39 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i256:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmovmskb %ymm0, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: cmpl $-1, %ecx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX1-LABEL: eq_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovq %xmm2, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm2, %r8
+; AVX1-NEXT: vmovq %xmm1, %rdi
+; AVX1-NEXT: xorq %rax, %rdi
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rsi
+; AVX1-NEXT: xorq %rcx, %rsi
+; AVX1-NEXT: orq %rdi, %rsi
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: xorq %rdx, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: xorq %r8, %rcx
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rsi, %rcx
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: eq_i256:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: sete %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%bcx = bitcast <4 x i64> %x to i256
%bcy = bitcast <4 x i64> %y to i256
%cmp = icmp eq i256 %bcx, %bcy
@@ -138,43 +189,37 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
ret i32 %zext
}
-; This test models the expansion of 'memcmp(a, b, 32) != 0'
+; This test models the expansion of 'memcmp(a, b, 32) != 0'
; if we allowed 2 pairs of 16-byte loads per block.
define i32 @ne_i128_pair(i128* %a, i128* %b) {
; SSE2-LABEL: ne_i128_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq 8(%rdi), %rcx
-; SSE2-NEXT: xorq (%rsi), %rax
-; SSE2-NEXT: xorq 8(%rsi), %rcx
-; SSE2-NEXT: movq 24(%rdi), %rdx
-; SSE2-NEXT: movq 16(%rdi), %rdi
-; SSE2-NEXT: xorq 16(%rsi), %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: xorq 24(%rsi), %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu (%rsi), %xmm2
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE2-NEXT: movdqu 16(%rsi), %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rdx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i128_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: xorq (%rsi), %rax
-; AVX2-NEXT: xorq 8(%rsi), %rcx
-; AVX2-NEXT: movq 24(%rdi), %rdx
-; AVX2-NEXT: movq 16(%rdi), %rdi
-; AVX2-NEXT: xorq 16(%rsi), %rdi
-; AVX2-NEXT: orq %rax, %rdi
-; AVX2-NEXT: xorq 24(%rsi), %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: ne_i128_pair:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
+; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: setne %al
+; AVXANY-NEXT: retq
%a0 = load i128, i128* %a
%b0 = load i128, i128* %b
%xor1 = xor i128 %a0, %b0
@@ -189,43 +234,37 @@ define i32 @ne_i128_pair(i128* %a, i128* %b) {
ret i32 %z
}
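In C terms, the IR in these *_pair tests is roughly the following (using the GCC/Clang __int128 extension; the function name is illustrative): two 16-byte loads per side, XORed and ORed together so a single compare answers memcmp(a, b, 32) != 0, which the new codegen then vectorizes into pcmpeqb/pand/pmovmskb:

#include <stdint.h>

int ne_32bytes(const unsigned __int128 *a, const unsigned __int128 *b) {
    unsigned __int128 x0 = a[0] ^ b[0]; /* first 16-byte pair */
    unsigned __int128 x1 = a[1] ^ b[1]; /* second 16-byte pair */
    return (x0 | x1) != 0;              /* one compare for all 32 bytes */
}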
-; This test models the expansion of 'memcmp(a, b, 32) == 0'
+; This test models the expansion of 'memcmp(a, b, 32) == 0'
; if we allowed 2 pairs of 16-byte loads per block.
define i32 @eq_i128_pair(i128* %a, i128* %b) {
; SSE2-LABEL: eq_i128_pair:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq 8(%rdi), %rcx
-; SSE2-NEXT: xorq (%rsi), %rax
-; SSE2-NEXT: xorq 8(%rsi), %rcx
-; SSE2-NEXT: movq 24(%rdi), %rdx
-; SSE2-NEXT: movq 16(%rdi), %rdi
-; SSE2-NEXT: xorq 16(%rsi), %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: xorq 24(%rsi), %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu (%rsi), %xmm2
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE2-NEXT: movdqu 16(%rsi), %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdi, %rdx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i128_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: xorq (%rsi), %rax
-; AVX2-NEXT: xorq 8(%rsi), %rcx
-; AVX2-NEXT: movq 24(%rdi), %rdx
-; AVX2-NEXT: movq 16(%rdi), %rdi
-; AVX2-NEXT: xorq 16(%rsi), %rdi
-; AVX2-NEXT: orq %rax, %rdi
-; AVX2-NEXT: xorq 24(%rsi), %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rdi, %rdx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: retq
+; AVXANY-LABEL: eq_i128_pair:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVXANY-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
+; AVXANY-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; AVXANY-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVXANY-NEXT: vpmovmskb %xmm0, %ecx
+; AVXANY-NEXT: xorl %eax, %eax
+; AVXANY-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; AVXANY-NEXT: sete %al
+; AVXANY-NEXT: retq
%a0 = load i128, i128* %a
%b0 = load i128, i128* %b
%xor1 = xor i128 %a0, %b0
@@ -240,7 +279,7 @@ define i32 @eq_i128_pair(i128* %a, i128* %b) {
ret i32 %z
}
-; This test models the expansion of 'memcmp(a, b, 64) != 0'
+; This test models the expansion of 'memcmp(a, b, 64) != 0'
; if we allowed 2 pairs of 32-byte loads per block.
define i32 @ne_i256_pair(i256* %a, i256* %b) {
@@ -273,34 +312,48 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: ne_i256_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: movq 24(%rdi), %r11
-; AVX2-NEXT: movq (%rdi), %r8
-; AVX2-NEXT: movq 8(%rdi), %r10
-; AVX2-NEXT: xorq 8(%rsi), %r10
-; AVX2-NEXT: xorq 24(%rsi), %r11
-; AVX2-NEXT: xorq (%rsi), %r8
-; AVX2-NEXT: xorq 16(%rsi), %r9
-; AVX2-NEXT: movq 48(%rdi), %rdx
-; AVX2-NEXT: movq 32(%rdi), %rax
-; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %rdi
-; AVX2-NEXT: xorq 40(%rsi), %rdi
-; AVX2-NEXT: xorq 56(%rsi), %rcx
-; AVX2-NEXT: orq %r11, %rcx
-; AVX2-NEXT: orq %rdi, %rcx
-; AVX2-NEXT: orq %r10, %rcx
-; AVX2-NEXT: xorq 32(%rsi), %rax
-; AVX2-NEXT: xorq 48(%rsi), %rdx
-; AVX2-NEXT: orq %r9, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
+; AVX1-LABEL: ne_i256_pair:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq 16(%rdi), %r9
+; AVX1-NEXT: movq 24(%rdi), %r11
+; AVX1-NEXT: movq (%rdi), %r8
+; AVX1-NEXT: movq 8(%rdi), %r10
+; AVX1-NEXT: xorq 8(%rsi), %r10
+; AVX1-NEXT: xorq 24(%rsi), %r11
+; AVX1-NEXT: xorq (%rsi), %r8
+; AVX1-NEXT: xorq 16(%rsi), %r9
+; AVX1-NEXT: movq 48(%rdi), %rdx
+; AVX1-NEXT: movq 32(%rdi), %rax
+; AVX1-NEXT: movq 56(%rdi), %rcx
+; AVX1-NEXT: movq 40(%rdi), %rdi
+; AVX1-NEXT: xorq 40(%rsi), %rdi
+; AVX1-NEXT: xorq 56(%rsi), %rcx
+; AVX1-NEXT: orq %r11, %rcx
+; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: orq %r10, %rcx
+; AVX1-NEXT: xorq 32(%rsi), %rax
+; AVX1-NEXT: xorq 48(%rsi), %rdx
+; AVX1-NEXT: orq %r9, %rdx
+; AVX1-NEXT: orq %rax, %rdx
+; AVX1-NEXT: orq %r8, %rdx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: ne_i256_pair:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vmovdqu (%rdi), %ymm0
+; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
+; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: setne %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%a0 = load i256, i256* %a
%b0 = load i256, i256* %b
%xor1 = xor i256 %a0, %b0
@@ -315,7 +368,7 @@ define i32 @ne_i256_pair(i256* %a, i256* %b) {
ret i32 %z
}
-; This test models the expansion of 'memcmp(a, b, 64) == 0'
+; This test models the expansion of 'memcmp(a, b, 64) == 0'
; if we allowed 2 pairs of 32-byte loads per block.
define i32 @eq_i256_pair(i256* %a, i256* %b) {
@@ -348,34 +401,48 @@ define i32 @eq_i256_pair(i256* %a, i256* %b) {
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
-; AVX2-LABEL: eq_i256_pair:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: movq 24(%rdi), %r11
-; AVX2-NEXT: movq (%rdi), %r8
-; AVX2-NEXT: movq 8(%rdi), %r10
-; AVX2-NEXT: xorq 8(%rsi), %r10
-; AVX2-NEXT: xorq 24(%rsi), %r11
-; AVX2-NEXT: xorq (%rsi), %r8
-; AVX2-NEXT: xorq 16(%rsi), %r9
-; AVX2-NEXT: movq 48(%rdi), %rdx
-; AVX2-NEXT: movq 32(%rdi), %rax
-; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %rdi
-; AVX2-NEXT: xorq 40(%rsi), %rdi
-; AVX2-NEXT: xorq 56(%rsi), %rcx
-; AVX2-NEXT: orq %r11, %rcx
-; AVX2-NEXT: orq %rdi, %rcx
-; AVX2-NEXT: orq %r10, %rcx
-; AVX2-NEXT: xorq 32(%rsi), %rax
-; AVX2-NEXT: xorq 48(%rsi), %rdx
-; AVX2-NEXT: orq %r9, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: retq
+; AVX1-LABEL: eq_i256_pair:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq 16(%rdi), %r9
+; AVX1-NEXT: movq 24(%rdi), %r11
+; AVX1-NEXT: movq (%rdi), %r8
+; AVX1-NEXT: movq 8(%rdi), %r10
+; AVX1-NEXT: xorq 8(%rsi), %r10
+; AVX1-NEXT: xorq 24(%rsi), %r11
+; AVX1-NEXT: xorq (%rsi), %r8
+; AVX1-NEXT: xorq 16(%rsi), %r9
+; AVX1-NEXT: movq 48(%rdi), %rdx
+; AVX1-NEXT: movq 32(%rdi), %rax
+; AVX1-NEXT: movq 56(%rdi), %rcx
+; AVX1-NEXT: movq 40(%rdi), %rdi
+; AVX1-NEXT: xorq 40(%rsi), %rdi
+; AVX1-NEXT: xorq 56(%rsi), %rcx
+; AVX1-NEXT: orq %r11, %rcx
+; AVX1-NEXT: orq %rdi, %rcx
+; AVX1-NEXT: orq %r10, %rcx
+; AVX1-NEXT: xorq 32(%rsi), %rax
+; AVX1-NEXT: xorq 48(%rsi), %rdx
+; AVX1-NEXT: orq %r9, %rdx
+; AVX1-NEXT: orq %rax, %rdx
+; AVX1-NEXT: orq %r8, %rdx
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: retq
+;
+; AVX256-LABEL: eq_i256_pair:
+; AVX256: # %bb.0:
+; AVX256-NEXT: vmovdqu (%rdi), %ymm0
+; AVX256-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX256-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
+; AVX256-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX256-NEXT: vpmovmskb %ymm0, %ecx
+; AVX256-NEXT: xorl %eax, %eax
+; AVX256-NEXT: cmpl $-1, %ecx
+; AVX256-NEXT: sete %al
+; AVX256-NEXT: vzeroupper
+; AVX256-NEXT: retq
%a0 = load i256, i256* %a
%b0 = load i256, i256* %b
%xor1 = xor i256 %a0, %b0
diff --git a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
index a65c1d312aa40..f7f9dff9beb08 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -2,17 +2,6 @@
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_storeu_ps:
-; SSE: ## %bb.0:
-; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT: movups %xmm0, (%eax)
-; SSE-NEXT: retl
-;
-; KNL-LABEL: test_x86_sse_storeu_ps:
-; KNL: ## %bb.0:
-; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL-NEXT: vmovups %xmm0, (%eax)
-; KNL-NEXT: retl
; CHECK-LABEL: test_x86_sse_storeu_ps:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -25,20 +14,6 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_add_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_add_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_add_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x58,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_add_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: addss %xmm1, %xmm0
@@ -50,20 +25,6 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_sub_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: subss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5c,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_sub_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5c,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_sub_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5c,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_sub_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: subss %xmm1, %xmm0
@@ -75,20 +36,6 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_mul_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: mulss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x59,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_mul_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x59,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_mul_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x59,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_mul_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: mulss %xmm1, %xmm0
@@ -100,20 +47,6 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_x86_sse_div_ss:
-; SSE: ## %bb.0:
-; SSE-NEXT: divss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5e,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_div_ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5e,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_div_ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5e,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_div_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: divss %xmm1, %xmm0
@@ -123,4 +56,3 @@ define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
-
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index a75a0597325d1..1acf1ad43f6de 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2364,8 +2364,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi8:
@@ -2373,8 +2373,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
%res0 = insertelement <16 x i8> undef, i8 %a0, i32 0
%res1 = insertelement <16 x i8> %res0, i8 %a0, i32 1
@@ -2401,15 +2401,15 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi16:
; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
%res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
%res1 = insertelement <8 x i16> %res0, i16 %a0, i32 1
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 3571e2968bf84..3dd3be6853f04 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -184,20 +184,6 @@ define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {
declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_add_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: addsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x58,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_add_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x58,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_add_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x58,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_add_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: addsd %xmm1, %xmm0
@@ -209,20 +195,6 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_sub_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: subsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5c,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_sub_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5c,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_sub_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5c,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_sub_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: subsd %xmm1, %xmm0
@@ -234,20 +206,6 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_mul_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: mulsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x59,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_mul_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x59,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_mul_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x59,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_mul_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: mulsd %xmm1, %xmm0
@@ -259,20 +217,6 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_x86_sse2_div_sd:
-; SSE: ## %bb.0:
-; SSE-NEXT: divsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5e,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_div_sd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5e,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_div_sd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5e,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_div_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: divsd %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index bdfc96ba97d5f..51f228b414ec0 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -919,12 +919,10 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
;
; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
@@ -1448,12 +1446,10 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
diff --git a/test/CodeGen/X86/vec_set-H.ll b/test/CodeGen/X86/vec_set-H.ll
index 03324f02a4fef..d96c8bbc61718 100644
--- a/test/CodeGen/X86/vec_set-H.ll
+++ b/test/CodeGen/X86/vec_set-H.ll
@@ -5,8 +5,8 @@ define <2 x i64> @doload64(i16 signext %x) nounwind {
; CHECK-LABEL: doload64:
; CHECK: # %bb.0:
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: retl
%tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0
%tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index 2178eb70cdecc..2cbf306c8ba00 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -5459,38 +5459,30 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm4
-; AVX512F-NEXT: vpslld $31, %zmm4, %zmm4
; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
; AVX512F-NEXT: kmovw %k0, 14(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512F-NEXT: kmovw %k0, 12(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512F-NEXT: kmovw %k0, 10(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, 8(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, 6(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, 4(%rdi)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, 2(%rdi)
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rdi)
; AVX512F-NEXT: movq %rdi, %rax
@@ -5505,38 +5497,30 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512DQ-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm4
-; AVX512DQ-NEXT: vpslld $31, %zmm4, %zmm4
; AVX512DQ-NEXT: vptestmd %zmm4, %zmm4, %k0
; AVX512DQ-NEXT: kmovw %k0, 14(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, 12(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
; AVX512DQ-NEXT: kmovw %k0, 10(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, 8(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
; AVX512DQ-NEXT: kmovw %k0, 6(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 4(%rdi)
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: movq %rdi, %rax
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index 782c72e2a4d4f..b2c0a4d096c2b 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; Lower common integer comparisons such as 'isPositive' efficiently:
; https://llvm.org/bugs/show_bug.cgi?id=26701
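; Illustrative sketch (an addition for exposition, not part of the original
; test file; the function name is hypothetical): the 'isPositive' idiom these
; tests exercise is a signed compare against all-ones, which SSE2 can lower
; to a single pcmpgtd against an all-ones register rather than an ashr+xor
; sequence:
;
; define <4 x i32> @is_positive_sketch(<4 x i32> %x) {
;   %c = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
;   %r = sext <4 x i1> %c to <4 x i32>
;   ret <4 x i32> %r
; }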
@@ -84,50 +84,13 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
}
define <1 x i128> @test_strange_type(<1 x i128> %x) {
-; SSE2-LABEL: test_strange_type:
-; SSE2: # %bb.0:
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: movq %rsi, %xmm0
-; SSE2-NEXT: notq %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: movq %rsi, %rdx
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: test_strange_type:
-; SSE42: # %bb.0:
-; SSE42-NEXT: sarq $63, %rsi
-; SSE42-NEXT: movq %rsi, %xmm0
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE42-NEXT: pxor %xmm0, %xmm1
-; SSE42-NEXT: movq %xmm1, %rax
-; SSE42-NEXT: pextrq $1, %xmm1, %rdx
-; SSE42-NEXT: retq
-;
-; AVX1-LABEL: test_strange_type:
-; AVX1: # %bb.0:
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_strange_type:
-; AVX2: # %bb.0:
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: retq
+; CHECK-LABEL: test_strange_type:
+; CHECK: # %bb.0:
+; CHECK-NEXT: sarq $63, %rsi
+; CHECK-NEXT: notq %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: retq
%sign = ashr <1 x i128> %x, <i128 127>
%not = xor <1 x i128> %sign, <i128 -1>
ret <1 x i128> %not
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index ca670f40ab3fb..8aa8682b6e441 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -796,8 +796,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -1011,8 +1011,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT: psllw $5, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 890cedf97c9dc..1e5dbea6bc618 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -663,8 +663,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -816,8 +816,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 9481e46c0c52e..724fd3454eec4 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -612,8 +612,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -758,8 +758,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 2f5a2b1161159..2ff7ef4328f53 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -12,8 +12,8 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -206,14 +206,14 @@ define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_0101010101010101:
; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
@@ -257,8 +257,8 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -372,14 +372,12 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
@@ -1181,21 +1179,20 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: psrlq $16, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlq $16, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,1,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
@@ -1203,7 +1200,7 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
@@ -1489,8 +1486,8 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
@@ -1531,8 +1528,8 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
@@ -1576,8 +1573,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
@@ -1614,8 +1611,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
@@ -1653,8 +1650,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
@@ -1706,8 +1703,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
@@ -1761,15 +1758,14 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movzwl %cx, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -1813,13 +1809,13 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movzbl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 072d71fae570a..fc22040578b14 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -63,14 +63,14 @@ define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_00000000:
; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_00000000:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_00000000:
@@ -1123,33 +1123,44 @@ define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
-; SSE-LABEL: shuffle_v8i16_0213cedf:
-; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_0213cedf:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0213cedf:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
+; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7]
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0213cedf:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_0213cedf:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_0213cedf:
; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15]
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: retq
@@ -1157,14 +1168,14 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,12,13,10,11,14,15]
; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-FAST-NEXT: retq
@@ -2111,79 +2122,115 @@ define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
}
define <8 x i16> @shuffle_v8i16_01100110(<8 x i16> %a) {
-; SSE2-LABEL: shuffle_v8i16_01100110:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_01100110:
+; SSE: # %bb.0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
;
-; SSSE3-LABEL: shuffle_v8i16_01100110:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
-; SSSE3-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_01100110:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
;
-; SSE41-LABEL: shuffle_v8i16_01100110:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
-; SSE41-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_01100110:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-SLOW-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_01100110:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
-; AVX-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v8i16_01100110:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_01100110:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_01100110:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_01u0u110(<8 x i16> %a) {
-; SSE2-LABEL: shuffle_v8i16_01u0u110:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_01u0u110:
+; SSE: # %bb.0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
;
-; SSSE3-LABEL: shuffle_v8i16_01u0u110:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1]
-; SSSE3-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_01u0u110:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
;
-; SSE41-LABEL: shuffle_v8i16_01u0u110:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1]
-; SSE41-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_01u0u110:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-SLOW-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_01u0u110:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1]
-; AVX-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v8i16_01u0u110:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_01u0u110:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_01u0u110:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 undef, i32 0, i32 undef, i32 1, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_467uu675(<8 x i16> %a) {
-; SSE2-LABEL: shuffle_v8i16_467uu675:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
-; SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_467uu675:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: retq
;
-; SSSE3-LABEL: shuffle_v8i16_467uu675:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
-; SSSE3-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_467uu675:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: retq
;
-; SSE41-LABEL: shuffle_v8i16_467uu675:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
-; SSE41-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_467uu675:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: retq
;
-; AVX-LABEL: shuffle_v8i16_467uu675:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
-; AVX-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v8i16_467uu675:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_467uu675:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_467uu675:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 4, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7, i32 5>
ret <8 x i16> %shuffle
}
@@ -2471,15 +2518,15 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
; SSE-LABEL: insert_dup_mem_v8i16_i32:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i32:
@@ -2498,8 +2545,8 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16:
@@ -2547,15 +2594,15 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
; SSE-LABEL: insert_dup_elt1_mem_v8i16_i32:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32:
@@ -2574,8 +2621,8 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32:
@@ -2612,8 +2659,8 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # %bb.0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
@@ -2665,8 +2712,8 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index cbd1b83a4eb2f..aafc9fc7dcd21 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -8,8 +8,8 @@
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -24,17 +24,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
}
@@ -42,17 +63,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
}
@@ -60,17 +102,38 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
@@ -78,8 +141,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -96,8 +159,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -114,8 +177,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -132,8 +195,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -152,9 +215,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -162,8 +226,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
@@ -191,9 +255,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -220,8 +285,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -254,8 +319,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -288,8 +353,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -314,8 +379,8 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -340,8 +405,8 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -366,8 +431,8 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -390,18 +455,18 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
@@ -411,8 +476,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
@@ -426,18 +491,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
@@ -447,8 +512,8 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
@@ -606,15 +671,36 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
}
@@ -622,15 +708,36 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
}
@@ -638,15 +745,36 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
@@ -913,8 +1041,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-SLOW-NEXT: retq
;
@@ -965,16 +1093,14 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1014,20 +1140,12 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
; AVX512VL: # %bb.0:
@@ -1042,17 +1160,35 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,1,0,4,5,6,7,8,8,9,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
ret <16 x i16> %shuffle
}
@@ -1060,17 +1196,35 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,2,4,5,6,7,8,8,8,10,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -1078,17 +1232,35 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,0,4,5,6,7,8,8,11,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -1256,9 +1428,11 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1273,9 +1447,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1290,9 +1466,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,3,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1411,8 +1589,8 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1
define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1464,8 +1642,8 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_0
define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
@@ -2036,9 +2214,10 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -2046,8 +2225,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
@@ -3261,15 +3440,25 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5,16,17,18,19,18,19,16,17,24,25,26,27,22,23,20,21]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,6,7,4,5,4,5,6,7,18,19,16,17,20,21,22,23,22,23,20,21,20,21,22,23]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,2,3,0,1,8,9,10,11,6,7,4,5,18,19,16,17,18,19,16,17,24,25,26,27,22,23,20,21]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,12,13,14,15,18,19,16,17,22,23,20,21,22,23,20,21,28,29,30,31]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX512VL: # %bb.0:
@@ -3344,17 +3533,16 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1
define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,4,5,14,15,0,1,4,5,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
@@ -4035,15 +4223,15 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16>
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
@@ -4055,8 +4243,8 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-SLOW-NEXT: retq
;
@@ -4073,8 +4261,8 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -4099,14 +4287,14 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
@@ -4116,8 +4304,8 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a,
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
@@ -4132,15 +4320,15 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
@@ -4152,8 +4340,8 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
@@ -4236,17 +4424,35 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,7,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11,24,25,28,29,30,31,30,31,24,25,28,29,30,31,26,27]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,5,8,9,10,11,12,14,15,13]
+; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,10,11,8,9,12,13,14,15,10,11,24,25,28,29,30,31,26,27,24,25,28,29,30,31,26,27]
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 4, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7, i32 5, i32 12, i32 14, i32 15, i32 undef, i32 undef, i32 14, i32 15, i32 13>
ret <16 x i16> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 51ef3a18438f4..948cfd10076d7 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1299,8 +1299,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1317,13 +1317,13 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -1331,8 +1331,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
@@ -1348,8 +1348,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
;
; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX512VLBW-SLOW: # %bb.0:
-; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
+; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1
@@ -1896,7 +1896,7 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -2540,6 +2540,34 @@ define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
ret <32 x i8> %shuffle
}

+; PR33740
+define <32 x i8> @shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23, i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
+ ret <32 x i8> %shuffle
+}
+
define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
; AVX1: # %bb.0:
@@ -2577,7 +2605,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2586,7 +2614,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,14,14,15,15]
; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index f3433ce834cd5..b41fcbe79b0ca 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -16,7 +16,6 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
@@ -51,7 +50,6 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512VL-NEXT: movq $-1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
@@ -85,7 +83,6 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
@@ -110,7 +107,6 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
@@ -125,7 +121,6 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -157,8 +152,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -172,8 +166,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT: vpslld $31, %zmm2, %zmm0
-; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -374,8 +367,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -389,8 +381,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
-; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
; AVX512VL-NEXT: vzeroupper
@@ -422,8 +413,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -437,7 +427,6 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -470,8 +459,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -484,7 +472,6 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -518,8 +505,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
@@ -534,7 +520,6 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -566,7 +551,6 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
@@ -578,7 +562,6 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512VL-NEXT: kmovw %edi, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
-; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %ax killed %ax killed %eax
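(Every hunk in this file removes the same redundant pair. The value fed to vptestmd/vptestmq is a shuffle of lanes that are already all-zeros or all-ones, produced by the masked vpternlogd/vmovdqa32 {z} above it, so bit 0 of each lane equals its sign bit and the vpslld $31/vpsllq $63 pre-shift changes nothing: testing the shuffle result against itself yields the same k-register. A minimal LLVM IR sketch of the pattern, assuming a plain llc run with AVX512F enabled; the function name is hypothetical and not part of the patch:

define i8 @shuffled_mask_to_gpr(<8 x i64> %a, <8 x i64> %b) {
  %m = icmp sgt <8 x i64> %a, %b    ; materializes as an all-zeros/all-ones vector
  %s = shufflevector <8 x i1> %m, <8 x i1> zeroinitializer, <8 x i32> <i32 8, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %r = bitcast <8 x i1> %s to i8    ; lowers to vptestmq with no vpsllq $63 pre-shift
  ret i8 %r
}

The shuffle indices mirror the shuf8i1_9_6_1_0_3_7_7_0 test above; index 8 selects lane 0 of the zero vector.)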
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index fd4c30fb327bc..56395309897f0 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -1852,6 +1852,83 @@ entry:
ret <16 x i8> %1
}
+define <8 x i16> @PR32160(<8 x i32> %x) {
+; SSE2-LABEL: PR32160:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: PR32160:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: PR32160:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: PR32160:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR32160:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: PR32160:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: PR32160:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: PR32160:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: PR32160:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %shuf = trunc <8 x i32> %x to <8 x i16>
+ %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i16> %trunc
+}
+
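(The new PR32160 test pins down a splat-of-truncated-lane fold: every result word is lane 2 of the truncated vector, i.e. the low 16 bits of dword 2 of %x, which is word 4 of xmm0, so from SSSE3 upward the backend emits just pshufhw to duplicate word 4 plus pshufd to splat dword 2, while plain SSE2 still goes through the pslld/psrad/packssdw truncation first. A variant that should exercise the same fold on a different lane, offered as a hypothetical sketch rather than anything taken from the patch:

define <8 x i16> @trunc_splat_lane5(<8 x i32> %x) {
  %t = trunc <8 x i32> %x to <8 x i16>
  %s = shufflevector <8 x i16> %t, <8 x i16> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i16> %s
}
)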
define void @PR34773(i16* %a0, i8* %a1) {
; SSE-LABEL: PR34773:
; SSE: # %bb.0:
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index 96e97c70dbf4d..bd38d901cc8ba 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -304,14 +304,14 @@ entry:
define <16 x i8> @load_splat_16i8_16i8_0101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i8_16i8_0101010101010101:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i8_16i8_0101010101010101:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i8_16i8_0101010101010101:
@@ -384,15 +384,15 @@ entry:
define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -459,15 +459,15 @@ entry:
define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
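(The widened-broadcast updates are output-equivalent rewrites of the same two-instruction splat: the old form copied word 0 into all four low words and then duplicated dwords 0 and 1, the new form copies word 0 only into words 0 and 1 and then splats dword 0 with pshufd mask [0,0,0,0], leaving the final step a plain dword broadcast. Judging by the test names, the IR behind these checks is presumably a load followed by a <0,1,0,1,...> splat shuffle, along these lines (sketch only, function name hypothetical):

define <16 x i8> @splat_byte_pair(<16 x i8>* %p) {
  %v = load <16 x i8>, <16 x i8>* %p
  %s = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i8> %s
}
)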
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 792bbbed52e14..e02258a788a11 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -645,9 +645,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
-; AVX512-NEXT: vpsllw $7, %xmm3, %xmm1
-; AVX512-NEXT: vpmovb2m %zmm1, %k0
-; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT: vpmovb2m %zmm3, %k0
; AVX512-NEXT: vpmovb2m %zmm0, %k1
; AVX512-NEXT: kxnorw %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
@@ -958,9 +956,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vpsllw $7, %ymm8, %ymm1
-; AVX512-NEXT: vpmovb2m %zmm1, %k0
-; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512-NEXT: vpmovb2m %zmm8, %k0
; AVX512-NEXT: vpmovb2m %zmm0, %k1
; AVX512-NEXT: kxnord %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
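(The x86-interleaved-access hunks apply the same known-bits reasoning to byte masks: vpmovb2m reads only bit 7 of each byte, and the vectors being converted come straight from vpcmpeqb, whose result bytes are already 0x00 or 0xFF, so the vpsllw $7 that moved bit 0 into the sign position, along with the extra register copy it forced, is dead. A minimal sketch of the value class, with the caveat that on AVX512BW instruction selection may fold the compare straight into a k-register instead of going through vpmovb2m:

define i16 @cmpeq_to_bitmask(<16 x i8> %a, <16 x i8> %b) {
  %c = icmp eq <16 x i8> %a, %b     ; vpcmpeqb: every byte is 0x00 or 0xff
  %m = bitcast <16 x i1> %c to i16  ; sign-bit extraction needs no vpsllw $7
  ret i16 %m
}
)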