Diffstat (limited to 'test/CodeGen/X86/vector-trunc-math.ll')
-rw-r--r-- | test/CodeGen/X86/vector-trunc-math.ll | 623
1 file changed, 364 insertions, 259 deletions
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index f828ed0ba6e7..ab34ad6a613c 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -42,6 +42,7 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -99,7 +100,7 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -109,6 +110,7 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -143,7 +145,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-LABEL: trunc_add_v8i32_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -154,6 +156,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -237,7 +240,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -262,6 +265,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: 
trunc_add_v16i64_v16i8: @@ -272,6 +276,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_v16i64_v16i8: @@ -282,6 +287,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -330,7 +336,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2: # BB#0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -346,6 +352,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX512: # BB#0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -392,6 +399,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_v16i16_v16i8: @@ -399,6 +407,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8: @@ -406,12 +415,62 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> ret <16 x i8> %2 } +define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) { +; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; SSE: # BB#0: +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = sext <8 x i8> %1 to <8 x i32> + %3 = add <8 x i32> %2, %a1 + %4 = trunc <8 x i32> %3 to <8 x i16> + ret <8 x i16> %4 +} + ; ; add to constant ; @@ -444,6 +503,7 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -491,7 +551,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -501,6 +561,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -531,7 +592,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_add_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -542,6 +603,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -573,22 +635,22 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: 
vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -603,7 +665,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -628,6 +690,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8: @@ -637,6 +700,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8: @@ -646,6 +710,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -669,13 +734,13 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, 
%xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 @@ -684,7 +749,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_add_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -701,6 +766,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -744,6 +810,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: @@ -751,6 +818,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: @@ -758,6 +826,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -800,6 +869,7 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -857,7 +927,7 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; 
AVX2-NEXT: vzeroupper @@ -867,6 +937,7 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -901,7 +972,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-LABEL: trunc_sub_v8i32_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -912,6 +983,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -995,7 +1067,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -1020,6 +1092,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_v16i64_v16i8: @@ -1030,6 +1103,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8: @@ -1040,6 +1114,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -1088,7 +1163,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2: # BB#0: ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -1104,6 +1179,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX512: # BB#0: ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -1150,6 +1226,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8: @@ -1157,6 +1234,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8: @@ -1164,6 +1242,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1211,6 +1290,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -1272,7 +1352,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -1282,6 +1362,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -1315,7 +1396,7 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -1326,6 +1407,7 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; 
AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -1411,7 +1493,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -1436,6 +1518,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8: @@ -1446,6 +1529,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8: @@ -1456,6 +1540,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -1502,7 +1587,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2: # BB#0: ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -1518,6 +1603,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -1563,6 +1649,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: @@ -1570,6 +1657,7 @@ define <16 x i8> 
@trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: @@ -1577,6 +1665,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1640,6 +1729,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: @@ -1649,6 +1739,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: @@ -1658,6 +1749,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -1725,7 +1817,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] @@ -1744,6 +1836,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: @@ -1751,12 +1844,14 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -1803,7 +1898,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-LABEL: trunc_mul_v8i32_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -1814,6 +1909,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2035,7 +2131,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -2068,6 +2164,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: @@ -2080,6 +2177,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: @@ -2090,6 +2188,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -2162,7 +2261,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2: # BB#0: ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -2178,6 +2277,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX512: # BB#0: ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -2224,6 +2324,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: 
vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: @@ -2231,6 +2332,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: @@ -2238,12 +2340,62 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> ret <16 x i8> %2 } +define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { +; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; SSE: # BB#0: +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = mul <8 x i32> %2, %a1 + %4 = trunc <8 x i32> %3 to <8 x i16> + ret <8 x i16> %4 +} + ; ; mul to constant ; @@ -2291,6 +2443,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -2338,7 +2491,7 @@ define <8 x i16> 
@trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -2348,6 +2501,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -2378,7 +2532,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -2389,6 +2543,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2558,7 +2713,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -2585,6 +2740,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: @@ -2595,6 +2751,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: @@ -2605,6 +2762,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; 
AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -2677,7 +2835,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 @@ -2695,6 +2853,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -2740,6 +2899,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: @@ -2747,6 +2907,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: @@ -2754,6 +2915,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -2794,6 +2956,7 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -2847,7 +3010,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -2857,6 +3020,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512: # 
BB#0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -2889,7 +3053,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-LABEL: trunc_and_v8i32_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -2900,6 +3064,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2975,7 +3140,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -3000,6 +3165,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_v16i64_v16i8: @@ -3010,6 +3176,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_v16i64_v16i8: @@ -3020,6 +3187,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3064,7 +3232,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2: # BB#0: ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -3080,6 +3248,7 @@ define <16 x i8> 
@trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX512: # BB#0: ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3124,6 +3293,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_v16i16_v16i8: @@ -3131,6 +3301,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: @@ -3138,6 +3309,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -3176,6 +3348,7 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3223,7 +3396,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -3233,6 +3406,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3263,7 +3437,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_and_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -3274,6 +3448,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x 
i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3305,22 +3480,22 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -3335,7 +3510,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -3360,6 +3535,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8: @@ -3369,6 +3545,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8: @@ -3378,6 +3555,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3401,13 +3579,13 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 @@ -3416,7 +3594,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_and_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -3433,6 +3611,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3476,6 +3655,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: @@ -3483,6 +3663,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: @@ -3490,6 +3671,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -3530,6 +3712,7 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3583,7 +3766,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -3593,6 +3776,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3625,7 +3809,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-LABEL: trunc_xor_v8i32_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -3636,6 +3820,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3711,7 +3896,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -3736,6 +3921,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_v16i64_v16i8: @@ -3746,6 +3932,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8: @@ -3756,6 +3943,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = xor <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3800,7 +3988,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2: # BB#0: ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: 
vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -3816,6 +4004,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX512: # BB#0: ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3860,6 +4049,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: @@ -3867,6 +4057,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: @@ -3874,6 +4065,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = xor <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -3912,6 +4104,7 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3959,7 +4152,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -3969,6 +4162,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3999,7 +4193,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -4010,6 +4204,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -4041,22 +4236,22 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -4071,7 +4266,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -4096,6 +4291,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8: @@ -4105,6 +4301,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8: @@ -4114,6 +4311,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -4137,13 +4335,13 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 @@ -4152,7 +4350,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -4169,6 +4367,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -4212,6 +4411,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: @@ -4219,6 +4419,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: @@ -4226,6 +4427,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -4266,6 +4468,7 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; 
AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -4319,7 +4522,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -4329,6 +4532,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -4361,7 +4565,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-LABEL: trunc_or_v8i32_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper @@ -4372,6 +4576,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -4447,7 +4652,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -4472,6 +4677,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_v16i64_v16i8: @@ -4482,6 +4688,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_v16i64_v16i8: @@ -4492,6 +4699,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x 
i64> %a0, <16 x i64> %a1) nounwind ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = or <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -4536,7 +4744,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX2: # BB#0: ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -4552,6 +4760,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX512: # BB#0: ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -4596,6 +4805,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_v16i16_v16i8: @@ -4603,6 +4813,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8: @@ -4610,6 +4821,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = or <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -4648,6 +4860,7 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -4695,7 +4908,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -4705,6 +4918,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, 
i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -4735,7 +4949,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_or_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -4746,6 +4960,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -4777,22 +4992,22 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -4807,7 +5022,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -4832,6 +5047,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8: @@ -4841,6 +5057,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x 
i64> %a0) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8: @@ -4850,6 +5067,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -4873,13 +5091,13 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 @@ -4888,7 +5106,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_or_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> @@ -4905,6 +5123,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -4948,6 +5167,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8: @@ -4955,6 +5175,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8: @@ -4962,6 +5183,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; 
AVX512DQ-NEXT: retq %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -5004,38 +5226,11 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: mul_add_const_v4i64_v4i32: -; AVX1: # BB#0: -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: mul_add_const_v4i64_v4i32: -; AVX2: # BB#0: -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: mul_add_const_v4i64_v4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: mul_add_const_v4i64_v4i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: mul_add_const_v4i64_v4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: retq +; AVX-LABEL: mul_add_const_v4i64_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 @@ -5086,38 +5281,11 @@ define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwin ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: mul_add_self_v4i64_v4i32: -; AVX1: # BB#0: -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: mul_add_self_v4i64_v4i32: -; AVX2: # BB#0: -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: mul_add_self_v4i64_v4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: mul_add_self_v4i64_v4i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: mul_add_self_v4i64_v4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: retq +; AVX-LABEL: mul_add_self_v4i64_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 @@ -5129,102 +5297,39 @@ define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwin define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; SSE-LABEL: mul_add_multiuse_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: paddq %xmm2, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm6 +; SSE-NEXT: paddq %xmm5, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: paddq %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: paddq %xmm1, %xmm2 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: paddq %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: pmuludq %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: paddq %xmm1, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: paddq %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: mul_add_multiuse_v4i64_v4i32: -; AVX1: # BB#0: -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[0,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: mul_add_multiuse_v4i64_v4i32: -; AVX2: # BB#0: -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: mul_add_multiuse_v4i64_v4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: mul_add_multiuse_v4i64_v4i32: 
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: mul_add_multiuse_v4i64_v4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT: retq
+; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
%3 = mul <4 x i64> %1, %2 |