Diffstat (limited to 'test/CodeGen/X86')
47 files changed, 2692 insertions, 1878 deletions
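Note on reproducing these tests: each hunk below edits FileCheck-based tests, and every test carries its own RUN line which lit executes with %s substituted by the test file's path. As a minimal sketch (not part of the diff), the RUN lines added to machine-combiner.ll in this change can be reproduced by hand, assuming an llvm build with llc and FileCheck on PATH and a checkout containing test/CodeGen/X86:

  $ cd test/CodeGen/X86
  $ llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math \
        < machine-combiner.ll | FileCheck machine-combiner.ll --check-prefix=SSE
  $ llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math \
        < machine-combiner.ll | FileCheck machine-combiner.ll --check-prefix=AVX

FileCheck exits non-zero if any SSE/AVX-prefixed assertion in the file fails to match llc's output.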
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll index 74d20f348b529..4e43f6f519210 100644 --- a/test/CodeGen/X86/avx-vperm2x128.ll +++ b/test/CodeGen/X86/avx-vperm2x128.ll @@ -269,7 +269,7 @@ entry: define <4 x double> @vperm2z_0x08(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x08: ; ALL: # BB#0: -; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] ; ALL-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x double> %s @@ -279,7 +279,7 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x18: ; ALL: # BB#0: ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; ALL-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3> ret <4 x double> %s @@ -288,7 +288,7 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) { define <4 x double> @vperm2z_0x28(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x28: ; ALL: # BB#0: -; ALL-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] ; ALL-NEXT: retq %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x double> %s @@ -298,7 +298,7 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x38: ; ALL: # BB#0: ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; ALL-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; ALL-NEXT: retq %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ret <4 x double> %s @@ -307,7 +307,7 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) { define <4 x double> @vperm2z_0x80(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x80: ; ALL: # BB#0: -; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero ; ALL-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x double> %s @@ -316,7 +316,7 @@ define <4 x double> @vperm2z_0x80(<4 x double> %a) { define <4 x double> @vperm2z_0x81(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x81: ; ALL: # BB#0: -; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero ; ALL-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ret <4 x double> %s @@ -325,7 +325,7 @@ define <4 x double> @vperm2z_0x81(<4 x double> %a) { define <4 x double> @vperm2z_0x82(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x82: ; ALL: # BB#0: -; ALL-NEXT: vperm2f128 $128, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],zero,zero ; ALL-NEXT: retq %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x double> %s @@ -334,7 +334,7 @@ define <4 x double> @vperm2z_0x82(<4 x double> %a) { define <4 x double> @vperm2z_0x83(<4 x double> %a) { ; ALL-LABEL: vperm2z_0x83: ; ALL: # BB#0: -; ALL-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0 +; ALL-NEXT: 
vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero ; ALL-NEXT: retq %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> ret <4 x double> %s @@ -345,8 +345,8 @@ define <4 x double> @vperm2z_0x83(<4 x double> %a) { define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) { ; ALL-LABEL: vperm2z_int_0x83: ; ALL: # BB#0: -; AVX1: vperm2f128 $129, %ymm0, %ymm0, %ymm0 -; AVX2: vperm2i128 $129, %ymm0, %ymm0, %ymm0 +; AVX1: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero +; AVX2: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> %c = add <4 x i64> %b, %s ret <4 x i64> %c diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index b9f490b8a39af..7642cd4e6c5ca 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -406,20 +406,6 @@ define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2 } declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly - define <8 x i32> @test_cvtpd2udq(<8 x double> %a) { - ;CHECK: vcvtpd2udq {ru-sae}{{.*}}encoding: [0x62,0xf1,0xfc,0x58,0x79,0xc0] - %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %a, <8 x i32>zeroinitializer, i8 -1, i32 2) - ret <8 x i32>%res - } - declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32) - - define <16 x i32> @test_cvtps2udq(<16 x float> %a) { - ;CHECK: vcvtps2udq {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x79,0xc0] - %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %a, <16 x i32>zeroinitializer, i16 -1, i32 1) - ret <16 x i32>%res - } - declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32) - define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) { ;CHECK: vcmpleps {sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x02] %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) @@ -434,35 +420,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no } declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) - ; cvt intrinsics - define <16 x float> @test_cvtdq2ps(<16 x i32> %a) { - ;CHECK: vcvtdq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x5b,0xc0] - %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1) - ret <16 x float>%res - } - declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) - - define <16 x float> @test_cvtudq2ps(<16 x i32> %a) { - ;CHECK: vcvtudq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7f,0x38,0x7a,0xc0] - %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1) - ret <16 x float>%res - } - declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32) - - define <8 x double> @test_cvtdq2pd(<8 x i32> %a) { - ;CHECK: vcvtdq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0] - %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1) - ret <8 x double>%res - } - declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) - - define <8 x double> @test_cvtudq2pd(<8 x i32> %a) { - ;CHECK: 
vcvtudq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0] - %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1) - ret <8 x double>%res - } - declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8) - ; fp min - max define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) { ; CHECK: vmaxpd @@ -482,13 +439,6 @@ define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) { declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) - define <8 x float> @test_cvtpd2ps(<8 x double> %a) { - ;CHECK: vcvtpd2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0xfd,0x38,0x5a,0xc0] - %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %a, <8 x float>zeroinitializer, i8 -1, i32 1) - ret <8 x float>%res - } - declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32) - declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16) ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512 diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 9574c016ad509..71bf63ed44d04 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -997,3 +997,44 @@ define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x ret <64 x i8> %res2 } +declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhuw {{.*}}encoding: [0x62 +define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhw {{.*}}encoding: [0x62 +define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhrsw {{.*}}encoding: [0x62 +define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { + %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 
0119d3945f4e8..f5413896789a6 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -3763,3 +3763,83 @@ define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> ret <16 x i16> %res2 } +declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhuw {{.*}}encoding: [0x62 +define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { + %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhuw {{.*}}encoding: [0x62 +define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { + %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhw {{.*}}encoding: [0x62 +define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { + %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhw {{.*}}encoding: [0x62 +define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { + %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhrsw {{.*}}encoding: [0x62 +define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { + %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 
x i16>, <16 x i16>, i16) +; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: {%k1} +; CHECK: vpmulhrsw {{.*}}encoding: [0x62 +define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { + %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} diff --git a/test/CodeGen/X86/cppeh-nounwind.ll b/test/CodeGen/X86/cppeh-nounwind.ll new file mode 100644 index 0000000000000..d9bc001a92df2 --- /dev/null +++ b/test/CodeGen/X86/cppeh-nounwind.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck %s + +; Sometimes invokes of nounwind functions make it through to CodeGen, especially +; at -O0, where Clang sometimes optimistically annotates functions as nounwind. +; WinEHPrepare ends up outlining functions, and emitting references to LSDA +; labels. Make sure we emit the LSDA in that case. + +declare i32 @__CxxFrameHandler3(...) +declare void @nounwind_func() nounwind +declare void @cleanup() + +define void @should_emit_tables() personality i32 (...)* @__CxxFrameHandler3 { +entry: + invoke void @nounwind_func() + to label %done unwind label %lpad + +done: + ret void + +lpad: + %vals = landingpad { i8*, i32 } + cleanup + call void @cleanup() + resume { i8*, i32 } %vals +} + +; CHECK: _should_emit_tables: +; CHECK: calll _nounwind_func +; CHECK: retl + +; CHECK: L__ehtable$should_emit_tables: + +; CHECK: ___ehhandler$should_emit_tables: +; CHECK: movl $L__ehtable$should_emit_tables, %eax +; CHECK: jmp ___CxxFrameHandler3 # TAILCALL diff --git a/test/CodeGen/X86/eh-nolandingpads.ll b/test/CodeGen/X86/eh-nolandingpads.ll new file mode 100644 index 0000000000000..962952266214f --- /dev/null +++ b/test/CodeGen/X86/eh-nolandingpads.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s +; Test that we emit functions with explicitly specified personality, +; even if no landing pads are left. + +declare i32 @__my_personality_v0(...) +declare void @might_throw() + +define i32 @foo() personality i32 (...)* @__my_personality_v0 { +; CHECK: .cfi_personality 3, __my_personality_v0 + call void @might_throw() + ret i32 0 +} diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll index 279bb0624ace6..34eac62e36733 100644 --- a/test/CodeGen/X86/fdiv-combine.ll +++ b/test/CodeGen/X86/fdiv-combine.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s ; Anything more than one division using a single divisor operand ; should be converted into a reciprocal and multiplication. @@ -17,9 +17,9 @@ define float @div2_arcp(float %x, float %y, float %z) #0 { ; CHECK: # BB#0: ; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: divss %xmm2, %xmm3 -; CHECK-NEXT: mulss %xmm3, %xmm0 ; CHECK-NEXT: mulss %xmm1, %xmm0 ; CHECK-NEXT: mulss %xmm3, %xmm0 +; CHECK-NEXT: mulss %xmm3, %xmm0 ; CHECK-NEXT: retq %div1 = fdiv arcp float %x, %z %mul = fmul arcp float %div1, %y @@ -27,5 +27,22 @@ define float @div2_arcp(float %x, float %y, float %z) #0 { ret float %div2 } +; If the reciprocal is already calculated, we should not +; generate an extra multiplication by 1.0. 
+ +define double @div3_arcp(double %x, double %y, double %z) #0 { +; CHECK-LABEL: div3_arcp: +; CHECK: # BB#0: +; CHECK-NEXT: movsd{{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: divsd %xmm1, %xmm2 +; CHECK-NEXT: mulsd %xmm2, %xmm0 +; CHECK-NEXT: addsd %xmm2, %xmm0 +; CHECK-NEXT: retq + %div1 = fdiv fast double 1.0, %y + %div2 = fdiv fast double %x, %y + %ret = fadd fast double %div2, %div1 + ret double %ret +} + ; FIXME: If the backend understands 'arcp', then this attribute is unnecessary. attributes #0 = { "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/frameescape.ll b/test/CodeGen/X86/frameescape.ll index 00bc55d248781..179a936304ba0 100644 --- a/test/CodeGen/X86/frameescape.ll +++ b/test/CodeGen/X86/frameescape.ll @@ -1,19 +1,19 @@ ; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86 ; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64 -declare void @llvm.frameescape(...) +declare void @llvm.localescape(...) declare i8* @llvm.frameaddress(i32) -declare i8* @llvm.framerecover(i8*, i8*, i32) +declare i8* @llvm.localrecover(i8*, i8*, i32) declare i32 @printf(i8*, ...) @str = internal constant [10 x i8] c"asdf: %d\0A\00" define void @print_framealloc_from_fp(i8* %fp) { - %a.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0) + %a.i8 = call i8* @llvm.localrecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0) %a = bitcast i8* %a.i8 to i32* %a.val = load i32, i32* %a call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %a.val) - %b.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1) + %b.i8 = call i8* @llvm.localrecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1) %b = bitcast i8* %b.i8 to i32* %b.val = load i32, i32* %b call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %b.val) @@ -61,7 +61,7 @@ define void @print_framealloc_from_fp(i8* %fp) { define void @alloc_func() { %a = alloca i32 %b = alloca i32, i32 2 - call void (...) @llvm.frameescape(i32* %a, i32* %b) + call void (...) @llvm.localescape(i32* %a, i32* %b) store i32 42, i32* %a store i32 13, i32* %b %fp = call i8* @llvm.frameaddress(i32 0) @@ -105,7 +105,7 @@ define i32 @main() { define void @alloc_func_no_frameaddr() { %a = alloca i32 %b = alloca i32 - call void (...) @llvm.frameescape(i32* %a, i32* %b) + call void (...) 
@llvm.localescape(i32* %a, i32* %b) store i32 42, i32* %a store i32 13, i32* %b call void @print_framealloc_from_fp(i8* null) diff --git a/test/CodeGen/X86/frameregister.ll b/test/CodeGen/X86/frameregister.ll new file mode 100644 index 0000000000000..826bb9d78c9dd --- /dev/null +++ b/test/CodeGen/X86/frameregister.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnueabi | FileCheck %s +; RUN: opt < %s -O3 -S -mtriple=x86_64-linux-gnueabi | FileCheck %s --check-prefix=OPT + +define i64 @get_frame() #0 { +entry: +; CHECK-LABEL: get_frame: +; CHECK: movq %rbp, %rax + %sp = call i64 @llvm.read_register.i64(metadata !0) +; OPT: @llvm.read_register.i64 + ret i64 %sp +} + +define void @set_frame(i64 %val) #0 { +entry: +; CHECK-LABEL: set_frame: +; CHECK: movq %rdi, %rbp + call void @llvm.write_register.i64(metadata !0, i64 %val) +; OPT: @llvm.write_register.i64 + ret void +} + +declare i64 @llvm.read_register.i64(metadata) nounwind +declare void @llvm.write_register.i64(metadata, i64) nounwind + +; register unsigned long current_stack_pointer asm("rbp"); +; CHECK-NOT: .asciz "rbp" +!0 = !{!"rbp\00"} + +attributes #0 = { nounwind "no-frame-pointer-elim"="true" } diff --git a/test/CodeGen/X86/implicit-null-check-negative.ll b/test/CodeGen/X86/implicit-null-check-negative.ll index 8fbed9f7bee85..c8d425c3889fe 100644 --- a/test/CodeGen/X86/implicit-null-check-negative.ll +++ b/test/CodeGen/X86/implicit-null-check-negative.ll @@ -51,4 +51,46 @@ define i32 @imp_null_check_load_no_md(i32* %x) { ret i32 %t } +define i32 @imp_null_check_no_hoist_over_acquire_load(i32* %x, i32* %y) { +; We cannot hoist %t1 over %t0 since %t0 is an acquire load + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t0 = load atomic i32, i32* %y acquire, align 4 + %t1 = load i32, i32* %x + %p = add i32 %t0, %t1 + ret i32 %p +} + +define i32 @imp_null_check_add_result(i32* %x, i32* %y) { +; This will codegen to: +; +; movl (%rsi), %eax +; addl (%rdi), %eax +; +; The load instruction we wish to hoist is the addl, but there is a +; write-after-write hazard preventing that from happening. We could +; get fancy here and exploit the commutativity of addition, but right +; now -implicit-null-checks isn't that smart. 
+; + + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t0 = load i32, i32* %y + %t1 = load i32, i32* %x + %p = add i32 %t0, %t1 + ret i32 %p +} + !0 = !{} diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll index 1d1b36bbd5d06..fd7a902eefc13 100644 --- a/test/CodeGen/X86/implicit-null-check.ll +++ b/test/CodeGen/X86/implicit-null-check.ll @@ -76,6 +76,31 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) { ret i32 %p1 } +define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) { +; CHECK-LABEL: _imp_null_check_hoist_over_unrelated_load: +; CHECK: Ltmp7: +; CHECK: movl (%rdi), %eax +; CHECK: movl (%rsi), %ecx +; CHECK: movl %ecx, (%rdx) +; CHECK: retq +; CHECK: Ltmp6: +; CHECK: movl $42, %eax +; CHECK: retq + + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t0 = load i32, i32* %y + %t1 = load i32, i32* %x + store i32 %t0, i32* %z + ret i32 %t1 +} + !0 = !{} ; CHECK-LABEL: __LLVM_FaultMaps: @@ -88,7 +113,7 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) { ; CHECK-NEXT: .short 0 ; # functions: -; CHECK-NEXT: .long 3 +; CHECK-NEXT: .long 4 ; FunctionAddr: ; CHECK-NEXT: .quad _imp_null_check_add_result @@ -117,6 +142,19 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) { ; CHECK-NEXT: .long Ltmp2-_imp_null_check_gep_load ; FunctionAddr: +; CHECK-NEXT: .quad _imp_null_check_hoist_over_unrelated_load +; NumFaultingPCs +; CHECK-NEXT: .long 1 +; Reserved: +; CHECK-NEXT: .long 0 +; Fault[0].Type: +; CHECK-NEXT: .long 1 +; Fault[0].FaultOffset: +; CHECK-NEXT: .long Ltmp7-_imp_null_check_hoist_over_unrelated_load +; Fault[0].HandlerOffset: +; CHECK-NEXT: .long Ltmp6-_imp_null_check_hoist_over_unrelated_load + +; FunctionAddr: ; CHECK-NEXT: .quad _imp_null_check_load ; NumFaultingPCs ; CHECK-NEXT: .long 1 @@ -131,10 +169,12 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) { ; OBJDUMP: FaultMap table: ; OBJDUMP-NEXT: Version: 0x1 -; OBJDUMP-NEXT: NumFunctions: 3 +; OBJDUMP-NEXT: NumFunctions: 4 ; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1 ; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 5 ; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1 ; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7 ; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1 +; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7 +; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1 ; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 3 diff --git a/test/CodeGen/X86/inline-asm-bad-constraint-n.ll b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll new file mode 100644 index 0000000000000..91b1ffed4e0fb --- /dev/null +++ b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll @@ -0,0 +1,10 @@ +; RUN: not llc -march=x86 -no-integrated-as < %s 2>&1 | FileCheck %s + +@x = global i32 0, align 4 + +;CHECK: error: invalid operand for inline asm constraint 'n' +define void @foo() { + %a = getelementptr i32, i32* @x, i32 1 + call void asm sideeffect "foo $0", "n"(i32* %a) nounwind + ret void +} diff --git a/test/CodeGen/X86/legalize-shl-vec.ll b/test/CodeGen/X86/legalize-shl-vec.ll new file mode 100644 index 0000000000000..7ec2a663513f5 --- /dev/null +++ b/test/CodeGen/X86/legalize-shl-vec.ll @@ 
-0,0 +1,44 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s + +define <2 x i256> @test_shl(<2 x i256> %In) { + %Amt = insertelement <2 x i256> undef, i256 -1, i32 0 + %Out = shl <2 x i256> %In, %Amt + ret <2 x i256> %Out + + ; CHECK-LABEL: test_shl + ; CHECK: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK: retq +} + +define <2 x i256> @test_srl(<2 x i256> %In) { + %Amt = insertelement <2 x i256> undef, i256 -1, i32 0 + %Out = lshr <2 x i256> %In, %Amt + ret <2 x i256> %Out + + ; CHECK-LABEL: test_srl + ; CHECK: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK-NEXT: movq $0 + ; CHECK: retq +} + +define <2 x i256> @test_sra(<2 x i256> %In) { + %Amt = insertelement <2 x i256> undef, i256 -1, i32 0 + %Out = ashr <2 x i256> %In, %Amt + ret <2 x i256> %Out + + ; CHECK-LABEL: test_sra + ; CHECK: sarq $63 +} diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll index d4cd59ffac3ac..0943bebbb0999 100644 --- a/test/CodeGen/X86/machine-combiner.ll +++ b/test/CodeGen/X86/machine-combiner.ll @@ -1,15 +1,23 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX ; Verify that the first two adds are independent regardless of how the inputs are ; commuted. The destination registers are used as source registers for the third add. 
define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) { -; CHECK-LABEL: reassociate_adds1: -; CHECK: # BB#0: -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: reassociate_adds1: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm2 +; SSE-NEXT: addss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_adds1: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = fadd float %x0, %x1 %t1 = fadd float %t0, %x2 %t2 = fadd float %t1, %x3 @@ -17,12 +25,19 @@ define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) { } define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) { -; CHECK-LABEL: reassociate_adds2: -; CHECK: # BB#0: -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: reassociate_adds2: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm2 +; SSE-NEXT: addss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_adds2: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = fadd float %x0, %x1 %t1 = fadd float %x2, %t0 %t2 = fadd float %t1, %x3 @@ -30,12 +45,19 @@ define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) { } define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) { -; CHECK-LABEL: reassociate_adds3: -; CHECK: # BB#0: -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: reassociate_adds3: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm2 +; SSE-NEXT: addss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_adds3: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = fadd float %x0, %x1 %t1 = fadd float %t0, %x2 %t2 = fadd float %x3, %t1 @@ -43,12 +65,19 @@ define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) { } define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) { -; CHECK-LABEL: reassociate_adds4: -; CHECK: # BB#0: -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: reassociate_adds4: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm2 +; SSE-NEXT: addss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_adds4: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = fadd float %x0, %x1 %t1 = fadd float %x2, %t0 %t2 = fadd float %x3, %t1 @@ -59,16 +88,27 @@ define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) { ; produced because that would cost more compile time. 
define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) { -; CHECK-LABEL: reassociate_adds5: -; CHECK: # BB#0: -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1 -; CHECK-NEXT: vaddss %xmm6, %xmm1, %xmm1 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm7, %xmm0, %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: reassociate_adds5: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm2 +; SSE-NEXT: addss %xmm2, %xmm0 +; SSE-NEXT: addss %xmm5, %xmm4 +; SSE-NEXT: addss %xmm6, %xmm4 +; SSE-NEXT: addss %xmm4, %xmm0 +; SSE-NEXT: addss %xmm7, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_adds5: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm1 +; AVX-NEXT: vaddss %xmm6, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm7, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = fadd float %x0, %x1 %t1 = fadd float %t0, %x2 %t2 = fadd float %t1, %x3 @@ -83,17 +123,90 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa ; Also, we should reassociate such that the result of the high latency division ; is used by the final 'add' rather than reassociating the %x3 operand with the ; division. The latter reassociation would not improve anything. - + define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) { -; CHECK-LABEL: reassociate_adds6: -; CHECK: # BB#0: -; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; SSE-LABEL: reassociate_adds6: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm2 +; SSE-NEXT: addss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_adds6: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = fdiv float %x0, %x1 %t1 = fadd float %x2, %t0 %t2 = fadd float %x3, %t1 ret float %t2 } +; Verify that SSE and AVX scalar single-precison multiplies are reassociated. + +define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) { +; SSE-LABEL: reassociate_muls1: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm3, %xmm2 +; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_muls1: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %t0 = fdiv float %x0, %x1 + %t1 = fmul float %x2, %t0 + %t2 = fmul float %x3, %t1 + ret float %t2 +} + +; Verify that SSE and AVX scalar double-precison adds are reassociated. 
+ +define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) { +; SSE-LABEL: reassociate_adds_double: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm3, %xmm2 +; SSE-NEXT: addsd %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_adds_double: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %t0 = fdiv double %x0, %x1 + %t1 = fadd double %x2, %t0 + %t2 = fadd double %x3, %t1 + ret double %t2 +} + +; Verify that SSE and AVX scalar double-precison multiplies are reassociated. + +define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) { +; SSE-LABEL: reassociate_muls_double: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm3, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: reassociate_muls_double: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %t0 = fdiv double %x0, %x1 + %t1 = fmul double %x2, %t0 + %t2 = fmul double %x3, %t1 + ret double %t2 +} + diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll index faaec262cb917..a6b721a7a6f17 100644 --- a/test/CodeGen/X86/pr13577.ll +++ b/test/CodeGen/X86/pr13577.ll @@ -1,5 +1,20 @@ -; RUN: llc < %s -march=x86-64 +; RUN: llc < %s -mtriple=x86_64-darwin | FileCheck %s +; CHECK-LABEL: LCPI0_0: +; CHECK-NEXT: .long 4286578688 +; CHECK-LABEL: LCPI0_1: +; CHECK-NEXT: .long 2139095040 + +; CHECK-LABEL: foo: +; CHECK: movq {{.*}}, %rax +; CHECK: shlq $48, %rax +; CHECK: sets %al +; CHECK: testb %al, %al +; CHECK: flds LCPI0_0(%rip) +; CHECK: flds LCPI0_1(%rip) +; CHECK: fcmovne %st(1), %st(0) +; CHECK: fstp %st(1) +; CHECK: retq define x86_fp80 @foo(x86_fp80 %a) { %1 = tail call x86_fp80 @copysignl(x86_fp80 0xK7FFF8000000000000000, x86_fp80 %a) nounwind readnone ret x86_fp80 %1 diff --git a/test/CodeGen/X86/read-fp-no-frame-pointer.ll b/test/CodeGen/X86/read-fp-no-frame-pointer.ll new file mode 100644 index 0000000000000..9f78c294ce88e --- /dev/null +++ b/test/CodeGen/X86/read-fp-no-frame-pointer.ll @@ -0,0 +1,12 @@ +; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s + +define i32 @get_frame() nounwind { +entry: +; CHECK: register ebp is allocatable: function has no frame pointer + %fp = call i32 @llvm.read_register.i32(metadata !0) + ret i32 %fp +} + +declare i32 @llvm.read_register.i32(metadata) nounwind + +!0 = !{!"ebp\00"} diff --git a/test/CodeGen/X86/seh-catch-all-win32.ll b/test/CodeGen/X86/seh-catch-all-win32.ll index 423b9914e99d2..a4ea8ab78c798 100644 --- a/test/CodeGen/X86/seh-catch-all-win32.ll +++ b/test/CodeGen/X86/seh-catch-all-win32.ll @@ -10,14 +10,14 @@ declare void @crash() declare i32 @printf(i8* nocapture readonly, ...) nounwind declare i32 @llvm.eh.typeid.for(i8*) declare i8* @llvm.frameaddress(i32) -declare i8* @llvm.framerecover(i8*, i8*, i32) -declare void @llvm.frameescape(...) +declare i8* @llvm.localrecover(i8*, i8*, i32) +declare void @llvm.localescape(...) declare i8* @llvm.x86.seh.recoverfp(i8*, i8*) define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) { entry: %__exceptioncode = alloca i32, align 4 - call void (...) @llvm.frameescape(i32* %__exceptioncode) + call void (...) 
@llvm.localescape(i32* %__exceptioncode) invoke void @crash() #5 to label %__try.cont unwind label %lpad @@ -45,7 +45,7 @@ define internal i32 @"filt$main"() { entry: %ebp = tail call i8* @llvm.frameaddress(i32 1) %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp) - %code.i8 = tail call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0) + %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0) %__exceptioncode = bitcast i8* %code.i8 to i32* %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20 %0 = bitcast i8* %info.addr to i32*** @@ -59,26 +59,38 @@ entry: ; Check that we can get the exception code from eax to the printf. ; CHECK-LABEL: _main: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; Ensure that we push *all* the CSRs, since they are clobbered by the +; __except block. +; CHECK: pushl %ebx +; CHECK: pushl %edi +; CHECK: pushl %esi + ; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]] ; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]] ; CHECK: movl %esp, [[reg_offs]](%ebp) ; CHECK: movl $L__ehtable$main, ; EH state 0 -; CHECK: movl $0, -4(%ebp) +; CHECK: movl $0, -16(%ebp) ; CHECK: calll _crash +; CHECK: popl %esi +; CHECK: popl %edi +; CHECK: popl %ebx ; CHECK: retl ; CHECK: # Block address taken ; stackrestore -; CHECK: movl [[reg_offs]](%ebp), %esp +; CHECK: movl -24(%ebp), %esp ; EH state -1 ; CHECK: movl [[code_offs]](%ebp), %[[code:[a-z]+]] -; CHECK: movl $-1, -4(%ebp) +; CHECK: movl $-1, -16(%ebp) ; CHECK-DAG: movl %[[code]], 4(%esp) ; CHECK-DAG: movl $_str, (%esp) ; CHECK: calll _printf ; CHECK: .section .xdata,"dr" ; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1 +; CHECK: .align 4 ; CHECK: L__ehtable$main ; CHECK-NEXT: .long -1 ; CHECK-NEXT: .long _filt$main diff --git a/test/CodeGen/X86/seh-except-finally.ll b/test/CodeGen/X86/seh-except-finally.ll index 4327a64468f92..0630d001bb764 100644 --- a/test/CodeGen/X86/seh-except-finally.ll +++ b/test/CodeGen/X86/seh-except-finally.ll @@ -41,7 +41,7 @@ entry: to label %invoke.cont unwind label %lpad invoke.cont: ; preds = %entry - %0 = call i8* @llvm.frameaddress(i32 0) + %0 = call i8* @llvm.localaddress() invoke void @"\01?fin$0@0@use_both@@"(i1 zeroext false, i8* %0) #5 to label %invoke.cont2 unwind label %lpad1 @@ -56,7 +56,7 @@ lpad: ; preds = %entry store i8* %2, i8** %exn.slot %3 = extractvalue { i8*, i32 } %1, 1 store i32 %3, i32* %ehselector.slot - %4 = call i8* @llvm.frameaddress(i32 0) + %4 = call i8* @llvm.localaddress() invoke void @"\01?fin$0@0@use_both@@"(i1 zeroext true, i8* %4) #5 to label %invoke.cont3 unwind label %lpad1 @@ -153,7 +153,7 @@ declare i32 @puts(i8*) #3 declare i32 @__C_specific_handler(...) ; Function Attrs: nounwind readnone -declare i8* @llvm.frameaddress(i32) #4 +declare i8* @llvm.localaddress() #4 ; Function Attrs: nounwind readnone declare i32 @llvm.eh.typeid.for(i8*) #4 diff --git a/test/CodeGen/X86/seh-stack-realign-win32.ll b/test/CodeGen/X86/seh-stack-realign-win32.ll new file mode 100644 index 0000000000000..f3ab71803ca7a --- /dev/null +++ b/test/CodeGen/X86/seh-stack-realign-win32.ll @@ -0,0 +1,99 @@ +; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s + +; 32-bit catch-all has to use a filter function because that's how it saves the +; exception code. + +@str = linkonce_odr unnamed_addr constant [27 x i8] c"GetExceptionCode(): 0x%lx\0A\00", align 1 + +declare i32 @_except_handler3(...) 
+declare void @crash() +declare i32 @printf(i8* nocapture readonly, ...) nounwind +declare i32 @llvm.eh.typeid.for(i8*) +declare i8* @llvm.frameaddress(i32) +declare i8* @llvm.localrecover(i8*, i8*, i32) +declare void @llvm.localescape(...) +declare i8* @llvm.x86.seh.recoverfp(i8*, i8*) + +define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) { +entry: + ; The EH code allocation is overaligned, triggering realignment. + %__exceptioncode = alloca i32, align 8 + call void (...) @llvm.localescape(i32* %__exceptioncode) + invoke void @crash() #5 + to label %__try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } + catch i8* bitcast (i32 ()* @"filt$main" to i8*) + %1 = extractvalue { i8*, i32 } %0, 1 + %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @"filt$main" to i8*)) #4 + %matches = icmp eq i32 %1, %2 + br i1 %matches, label %__except, label %eh.resume + +__except: ; preds = %lpad + %3 = load i32, i32* %__exceptioncode, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str, i32 0, i32 0), i32 %3) #4 + br label %__try.cont + +__try.cont: ; preds = %entry, %__except + ret i32 0 + +eh.resume: ; preds = %lpad + resume { i8*, i32 } %0 +} + +define internal i32 @"filt$main"() { +entry: + %ebp = tail call i8* @llvm.frameaddress(i32 1) + %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp) + %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0) + %__exceptioncode = bitcast i8* %code.i8 to i32* + %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20 + %0 = bitcast i8* %info.addr to i32*** + %1 = load i32**, i32*** %0, align 4 + %2 = load i32*, i32** %1, align 4 + %3 = load i32, i32* %2, align 4 + store i32 %3, i32* %__exceptioncode, align 4 + ret i32 1 +} + +; Check that we can get the exception code from eax to the printf. 
+ +; CHECK-LABEL: _main: +; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]] +; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]] +; CHECK: movl %esp, [[reg_offs]](%esi) +; CHECK: movl $L__ehtable$main, +; EH state 0 +; CHECK: movl $0, 40(%esi) +; CHECK: calll _crash +; CHECK: retl +; CHECK: # Block address taken +; stackrestore +; CHECK: movl -24(%ebp), %esp +; CHECK: movl $Lmain$parent_frame_offset, %eax +; CHECK: negl %eax +; CHECK: leal -24(%ebp,%eax), %esi +; CHECK: movl 12(%esi), %ebp # 4-byte Reload +; EH state -1 +; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]] +; CHECK: movl $-1, 40(%esi) +; CHECK-DAG: movl %[[code]], 4(%esp) +; CHECK-DAG: movl $_str, (%esp) +; CHECK: calll _printf + +; CHECK: .section .xdata,"dr" +; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1 +; CHECK: L__ehtable$main +; CHECK-NEXT: .long -1 +; CHECK-NEXT: .long _filt$main +; CHECK-NEXT: .long Ltmp{{[0-9]+}} + +; CHECK-LABEL: _filt$main: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: movl (%ebp), %[[oldebp:[a-z]+]] +; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]] +; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]] +; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]] +; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}}) diff --git a/test/CodeGen/X86/seh-stack-realign.ll b/test/CodeGen/X86/seh-stack-realign.ll new file mode 100644 index 0000000000000..f2fb28a081f9d --- /dev/null +++ b/test/CodeGen/X86/seh-stack-realign.ll @@ -0,0 +1,101 @@ +; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s + +; 32-bit catch-all has to use a filter function because that's how it saves the +; exception code. + +@str = linkonce_odr unnamed_addr constant [27 x i8] c"GetExceptionCode(): 0x%lx\0A\00", align 1 + +declare i32 @_except_handler3(...) +declare void @crash() +declare i32 @printf(i8* nocapture readonly, ...) nounwind +declare i32 @llvm.eh.typeid.for(i8*) +declare i8* @llvm.frameaddress(i32) +declare i8* @llvm.localrecover(i8*, i8*, i32) +declare void @llvm.localescape(...) +declare i8* @llvm.x86.seh.recoverfp(i8*, i8*) + +define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) { +entry: + ; The EH code allocation is overaligned, triggering realignment. + %__exceptioncode = alloca i32, align 8 + call void (...) @llvm.localescape(i32* %__exceptioncode) + invoke void @crash() #5 + to label %__try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } + catch i8* bitcast (i32 ()* @"filt$main" to i8*) + %1 = extractvalue { i8*, i32 } %0, 1 + %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 ()* @"filt$main" to i8*)) #4 + %matches = icmp eq i32 %1, %2 + br i1 %matches, label %__except, label %eh.resume + +__except: ; preds = %lpad + %3 = load i32, i32* %__exceptioncode, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str, i32 0, i32 0), i32 %3) #4 + br label %__try.cont + +__try.cont: ; preds = %entry, %__except + ret i32 0 + +eh.resume: ; preds = %lpad + resume { i8*, i32 } %0 +} + +define internal i32 @"filt$main"() { +entry: + %ebp = tail call i8* @llvm.frameaddress(i32 1) + %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp) + %code.i8 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0) + %__exceptioncode = bitcast i8* %code.i8 to i32* + %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20 + %0 = bitcast i8* %info.addr to i32*** + %1 = load i32**, i32*** %0, align 4 + %2 = load i32*, i32** %1, align 4 + %3 = load i32, i32* %2, align 4 + store i32 %3, i32* %__exceptioncode, align 4 + ret i32 1 +} + +; Check that we can get the exception code from eax to the printf. + +; CHECK-LABEL: _main: +; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]] +; CHECK: Lmain$frame_escape_1 = [[reg_offs:[-0-9]+]] +; CHECK: movl %esp, [[reg_offs]](%esi) +; CHECK: movl $L__ehtable$main, +; EH state 0 +; CHECK: movl $0, 40(%esi) +; CHECK: calll _crash +; CHECK: retl +; CHECK: # Block address taken +; Restore ESP +; CHECK: movl -24(%ebp), %esp +; Restore ESI +; CHECK: movl $Lmain$parent_frame_offset, %eax +; CHECK: negl %eax +; CHECK: leal -24(%ebp,%eax), %esi +; Restore EBP +; CHECK: movl 12(%esi), %ebp # 4-byte Reload +; EH state -1 +; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]] +; CHECK: movl $-1, 40(%esi) +; CHECK-DAG: movl %[[code]], 4(%esp) +; CHECK-DAG: movl $_str, (%esp) +; CHECK: calll _printf + +; CHECK: .section .xdata,"dr" +; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1 +; CHECK: L__ehtable$main +; CHECK-NEXT: .long -1 +; CHECK-NEXT: .long _filt$main +; CHECK-NEXT: .long Ltmp{{[0-9]+}} + +; CHECK-LABEL: _filt$main: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: movl (%ebp), %[[oldebp:[a-z]+]] +; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]] +; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]] +; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]] +; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}}) diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll index 0f8d9f4d713fa..9b851db8121c4 100644 --- a/test/CodeGen/X86/sqrt-fastmath.ll +++ b/test/CodeGen/X86/sqrt-fastmath.ll @@ -34,11 +34,11 @@ define float @ff(float %f) #0 { ; ESTIMATE: # BB#0: ; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2 -; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm1 -; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm1 +; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm3 +; ESTIMATE-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 +; ESTIMATE-NEXT: vmulss %xmm0, %xmm2, %xmm2 ; ESTIMATE-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; ESTIMATE-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; ESTIMATE-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; ESTIMATE-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0 ; ESTIMATE-NEXT: vandnps %xmm1, %xmm0, %xmm0 @@ -78,7 +78,7 @@ define float @reciprocal_square_root(float %x) #0 { ; ESTIMATE: # BB#0: ; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1 ; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2 -; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm1 +; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; ESTIMATE-NEXT: vmulss %xmm2, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll 
b/test/CodeGen/X86/sse2-vector-shifts.ll index 7c8d5e5788983..45028cf4bd372 100644 --- a/test/CodeGen/X86/sse2-vector-shifts.ll +++ b/test/CodeGen/X86/sse2-vector-shifts.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s ; SSE2 Logical Shift Left diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index c1cd91beaf531..398675276c664 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -1,6 +1,6 @@ ; These are tests for SSE3 codegen. -; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-apple-darwin9 --mattr=+sse3 | FileCheck %s --check-prefix=X64 ; Test for v8xi16 lowering where we extract the first element of the vector and ; placed it in the second element of the result. diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll index c7c1fc9463866..63aa742bdf018 100644 --- a/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -1409,12 +1409,26 @@ define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) { } declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone -; TODO stack_fold_roundsd +define double @stack_fold_roundsd(double %a0) optsize { + ;CHECK-LABEL: stack_fold_roundsd + ;CHECK: vroundsd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call double @llvm.floor.f64(double %a0) + ret double %2 +} +declare double @llvm.floor.f64(double) nounwind readnone ; TODO stack_fold_roundsd_int declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone -; TODO stack_fold_roundss +define float @stack_fold_roundss(float %a0) optsize { + ;CHECK-LABEL: stack_fold_roundss + ;CHECK: vroundss $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call float @llvm.floor.f32(float %a0) + ret float %2 +} +declare float @llvm.floor.f32(float) nounwind readnone ; TODO stack_fold_roundss_int declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll index 63acf5f4f96f4..f9fcbaabdebb4 100644 --- a/test/CodeGen/X86/stack-folding-fp-sse42.ll +++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -884,11 +884,29 @@ define <4 x float> @stack_fold_roundps(<4 x float> %a0) { } declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone -; TODO stack_fold_roundsd +define double @stack_fold_roundsd(double %a0) optsize { + ;CHECK-LABEL: stack_fold_roundsd + ;CHECK: roundsd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call double @llvm.floor.f64(double %a0) + ret double %2 +} +declare 
double @llvm.floor.f64(double) nounwind readnone + ; TODO stack_fold_roundsd_int +declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone + +define float @stack_fold_roundss(float %a0) optsize { + ;CHECK-LABEL: stack_fold_roundss + ;CHECK: roundss $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call float @llvm.floor.f32(float %a0) + ret float %2 +} +declare float @llvm.floor.f32(float) nounwind readnone -; TODO stack_fold_roundss ; TODO stack_fold_roundss_int +declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone ; TODO stack_fold_rsqrtps @@ -938,13 +956,25 @@ define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) { } declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone -; TODO stack_fold_sqrtsd +define double @stack_fold_sqrtsd(double %a0) optsize { + ;CHECK-LABEL: stack_fold_sqrtsd + ;CHECK: sqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call double @llvm.sqrt.f64(double %a0) + ret double %2 +} declare double @llvm.sqrt.f64(double) nounwind readnone ; TODO stack_fold_sqrtsd_int declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone -; TODO stack_fold_sqrtss +define float @stack_fold_sqrtss(float %a0) optsize { + ;CHECK-LABEL: stack_fold_sqrtss + ;CHECK: sqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call float @llvm.sqrt.f32(float %a0) + ret float %2 +} declare float @llvm.sqrt.f32(float) nounwind readnone ; TODO stack_fold_sqrtss_int diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll index 3e72212d85d3e..3b1b2f5c1c775 100644 --- a/test/CodeGen/X86/vec_fp_to_int.ll +++ b/test/CodeGen/X86/vec_fp_to_int.ll @@ -1,5 +1,10 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; +; 32-bit tests to make sure we're not doing anything stupid. 
+; RUN: llc < %s -mtriple=i686-unknown-unknown +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 ; ; Double to Signed Integer diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index ca8be65075b90..4a3d088139040 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -1,6 +1,11 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; +; 32-bit tests to make sure we're not doing anything stupid. +; RUN: llc < %s -mtriple=i686-unknown-unknown +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 ; ; Signed Integer to Double @@ -279,36 +284,19 @@ define <2 x double> @uitofp_2vf64_i32(<4 x i32> %a) { define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) { ; SSE2-LABEL: uitofp_2vf64_i16: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] -; SSE2-NEXT: subpd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: addpd %xmm4, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: subpd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: pand .LCPI10_0(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: uitofp_2vf64_i16: ; AVX: # BB#0: ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] -; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-NEXT: vpand .LCPI10_0(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1> %cvt = uitofp <2 x i16> %shuf to <2 x double> @@ -318,37 +306,20 @@ define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) { define <2 x double> @uitofp_2vf64_i8(<16 x i8> %a) { ; SSE2-LABEL: 
uitofp_2vf64_i8: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] -; SSE2-NEXT: subpd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: addpd %xmm4, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: subpd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pand .LCPI11_0(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: uitofp_2vf64_i8: ; AVX: # BB#0: ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] -; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-NEXT: vpand .LCPI11_0(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1> %cvt = uitofp <2 x i8> %shuf to <2 x double> @@ -493,34 +464,11 @@ define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) { ; SSE2-LABEL: uitofp_4vf64_i16: ; SSE2: # BB#0: ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] -; SSE2-NEXT: subpd %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] -; SSE2-NEXT: addpd %xmm5, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: subpd %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] -; SSE2-NEXT: addpd %xmm1, %xmm5 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pand .LCPI14_2(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = 
xmm2[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: subpd %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE2-NEXT: subpd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] -; SSE2-NEXT: addpd %xmm5, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: uitofp_4vf64_i16: @@ -536,38 +484,13 @@ define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) { define <4 x double> @uitofp_4vf64_i8(<16 x i8> %a) { ; SSE2-LABEL: uitofp_4vf64_i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25] -; SSE2-NEXT: subpd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] -; SSE2-NEXT: addpd %xmm5, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: subpd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] -; SSE2-NEXT: addpd %xmm4, %xmm5 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pand .LCPI15_2(%rip), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: subpd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1] -; SSE2-NEXT: addpd %xmm4, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE2-NEXT: subpd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] -; SSE2-NEXT: addpd %xmm5, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: uitofp_4vf64_i8: diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll index ce98e6759b659..47878360ca0af 100644 --- a/test/CodeGen/X86/vector-gep.ll +++ b/test/CodeGen/X86/vector-gep.ll @@ -92,3 +92,25 @@ entry: ;CHECK: ret } +;CHECK-LABEL: AGEP7: +define <4 x i8*> @AGEP7(<4 x i8*> %param, i32 %off) nounwind { +entry: +;CHECK: vbroadcastss +;CHECK: vpadd + %A = getelementptr i8, <4 x i8*> %param, i32 
%off + ret <4 x i8*> %A +;CHECK: ret +} + +;CHECK-LABEL: AGEP8: +define <4 x i16*> @AGEP8(i16* %param, <4 x i32> %off) nounwind { +entry: +; Multiply offset by two (add it to itself). +;CHECK: vpadd +; add the base to the offset +;CHECK: vbroadcastss +;CHECK-NEXT: vpadd + %A = getelementptr i16, i16* %param, <4 x i32> %off + ret <4 x i16*> %A +;CHECK: ret +} diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index aafc05b2ed4ce..8e79493ddd073 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -160,14 +160,14 @@ entry: define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) { ; SSE2-LABEL: load_sext_test1: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_test1: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: retq @@ -196,7 +196,7 @@ entry: define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) { ; SSE2-LABEL: load_sext_test2: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 @@ -204,7 +204,7 @@ define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) { ; ; SSSE3-LABEL: load_sext_test2: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $24, %xmm0 @@ -280,7 +280,7 @@ entry: define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) { ; SSE2-LABEL: load_sext_test4: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -290,7 +290,7 @@ define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) { ; ; SSSE3-LABEL: load_sext_test4: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -322,7 +322,7 @@ entry: define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) { ; SSE2-LABEL: load_sext_test5: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -330,7 +330,7 @@ define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) { ; ; SSSE3-LABEL: load_sext_test5: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -360,14 +360,14 @@ entry: define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) { ; SSE2-LABEL: load_sext_test6: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_test6: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psraw $8, %xmm0 ; SSSE3-NEXT: retq @@ -463,20 +463,20 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { ; SSE2-LABEL: sext_16i8_to_16i16: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: movq 8(%rdi), %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_16i8_to_16i16: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: movq 8(%rdi), %xmm1 +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psraw $8, %xmm1 ; SSSE3-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index 4fd2f8b51b8b2..61b30154950d2 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -10,43 +10,43 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: var_shift_v2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movd %xmm1, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movd %xmm1, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: pextrq $1, %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: movd %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: pextrq $1, %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movd %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shift_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: sarq %cl, %rax -; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: sarq 
%cl, %rax -; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vpextrq $1, %xmm1, %rcx +; AVX-NEXT: sarq %cl, %rax +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovq %xmm1, %rcx +; AVX-NEXT: sarq %cl, %rax +; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX-NEXT: retq %shift = ashr <2 x i64> %a, %b @@ -56,73 +56,63 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: var_shift_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: sarl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: sarl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: sarl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: sarl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrad %xmm2, %xmm4 +; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrad %xmm4, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: psrad %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: pextrd $1, %xmm1, %ecx -; SSE41-NEXT: sarl %cl, %eax -; SSE41-NEXT: movd %xmm0, %edx -; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: sarl %cl, %edx -; SSE41-NEXT: movd %edx, %xmm2 -; SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; SSE41-NEXT: pextrd $2, %xmm0, %eax -; SSE41-NEXT: pextrd $2, %xmm1, %ecx -; SSE41-NEXT: sarl %cl, %eax -; SSE41-NEXT: pinsrd $2, %eax, %xmm2 -; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: pextrd $3, %xmm1, %ecx -; SSE41-NEXT: sarl %cl, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm2 +; SSE41-NEXT: movdqa %xmm0, 
%xmm4 +; SSE41-NEXT: psrad %xmm2, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrad %xmm1, %xmm2 +; SSE41-NEXT: psrad %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: vpextrd $1, %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %edx -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i32: @@ -136,84 +126,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: var_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: psllw $12, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, 
%xmm1 +; SSE2-NEXT: psraw $15, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psraw $4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psraw $2, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $2, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $1, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -221,9 +211,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %shift = ashr <8 x i16> %a, %b @@ -234,123 +224,123 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: var_shift_v16i8: ; SSE2: # BB#0: ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $2, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por 
%xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $4, %xmm4 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $2, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; 
; AVX-LABEL: var_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -363,61 +353,61 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: splatvar_shift_v2i64: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: movd %xmm2, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: sarq %cl, %rax -; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movd %xmm2, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; 
SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: pextrq $1, %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm2 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: movd %xmm1, %rcx -; SSE41-NEXT: sarq %cl, %rax -; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: pextrq $1, %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movd %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: sarq %cl, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v2i64: ; AVX2: # BB#0: ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: sarq %cl, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: retq %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i64> %a, %splat @@ -453,10 +443,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: splatvar_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movzwl %ax, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: psraw %xmm1, %xmm0 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psraw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v8i16: @@ -481,160 +471,160 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE2-NEXT: psllw $5, %xmm3 +; SSE2-NEXT: 
psllw $5, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: psraw $4, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: psraw $2, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: psraw $1, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $1, 
%xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm1 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $4, %xmm4 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $2, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; 
SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v16i8: ; AVX2: # BB#0: ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; 
AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i8> %a, %splat @@ -648,36 +638,36 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: sarq %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: sarq $7, %rax -; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: sarq %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: sarq $7, %rax +; SSE2-NEXT: movd %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: sarq $7, %rax -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: sarq %rax -; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: sarq $7, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: sarq %rax +; SSE41-NEXT: movd %rax, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: constant_shift_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: sarq $7, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: sarq %rax -; AVX-NEXT: vmovq %rax, %xmm0 +; 
AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: sarq $7, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: sarq %rax +; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %shift = ashr <2 x i64> %a, <i64 1, i64 7> @@ -687,58 +677,43 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; SSE2-LABEL: constant_shift_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: sarl $7, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: sarl $5, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: sarl $4, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: sarl $6, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $7, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $5, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $6, %xmm2 +; SSE2-NEXT: psrad $4, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: sarl $5, %eax -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: sarl $4, %ecx -; SSE41-NEXT: movd %ecx, %xmm1 -; SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; SSE41-NEXT: pextrd $2, %xmm0, %eax -; SSE41-NEXT: sarl $6, %eax -; SSE41-NEXT: pinsrd $2, %eax, %xmm1 -; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: sarl $7, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $7, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrad $5, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $6, %xmm1 +; SSE41-NEXT: psrad $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: sarl $5, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: sarl $4, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: sarl $6, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: sarl $7, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i32: @@ -752,56 +727,56 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; SSE2-LABEL: constant_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psraw $4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; SSE2-NEXT: psraw $2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $8, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $8, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784] +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568] +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $2, 
%xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600] +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664] +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -809,9 +784,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> @@ -822,126 +797,126 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SSE2-LABEL: constant_shift_v16i8: ; SSE2: # BB#0: ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; SSE2-NEXT: psllw $5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE2-NEXT: psllw $5, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: psraw $4, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: psraw $2, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: psraw $1, %xmm1 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw 
%xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; SSE41-NEXT: psllw $5, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE41-NEXT: psllw $5, %xmm3 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $4, %xmm4 -; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $2, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psraw $4, %xmm3 -; SSE41-NEXT: pblendvb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psraw $2, %xmm3 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psraw $1, %xmm3 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psraw $4, %xmm3 +; SSE41-NEXT: pblendvb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psraw $2, %xmm3 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psraw $1, %xmm3 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: constant_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, 
%xmm0, %xmm0 ; AVX-NEXT: retq %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <16 x i8> %shift @@ -954,38 +929,35 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE2-LABEL: splatconstant_shift_v2i64: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: sarq $7, %rax -; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax -; SSE2-NEXT: sarq $7, %rax -; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: psrlq $7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatconstant_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: sarq $7, %rax -; SSE41-NEXT: movd %rax, %xmm1 -; SSE41-NEXT: movd %xmm0, %rax -; SSE41-NEXT: sarq $7, %rax -; SSE41-NEXT: movd %rax, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $7, %xmm1 +; SSE41-NEXT: psrlq $7, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v2i64: -; AVX: # BB#0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: sarq $7, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: sarq $7, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrad $7, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %shift = ashr <2 x i64> %a, <i64 7, i64 7> ret <2 x i64> %shift } @@ -1021,20 +993,20 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; SSE-LABEL: splatconstant_shift_v16i8: ; SSE: # BB#0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <16 x i8> %shift diff --git 
a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 3fc377af56500..e4642558e0e43 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -63,39 +63,30 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: var_shift_v8i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrd $1, %xmm2, %eax ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpextrd $1, %xmm3, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vmovd %xmm2, %edx -; AVX1-NEXT: vmovd %xmm3, %ecx -; AVX1-NEXT: sarl %cl, %edx -; AVX1-NEXT: vmovd %edx, %xmm4 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX1-NEXT: vpextrd $2, %xmm2, %eax -; AVX1-NEXT: vpextrd $2, %xmm3, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4 -; AVX1-NEXT: vpextrd $3, %xmm2, %eax -; AVX1-NEXT: vpextrd $3, %xmm3, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2 -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: vpextrd $1, %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %edx -; AVX1-NEXT: vmovd %edx, %xmm3 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: sarl %cl, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -489,32 +480,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; AVX1-LABEL: constant_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: sarl $9, %eax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: sarl $8, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-NEXT: sarl $8, %eax -; AVX1-NEXT: 
vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: sarl $7, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: sarl $5, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: sarl $4, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: sarl $6, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: sarl $7, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $4, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $9, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v8i32: @@ -663,41 +642,20 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; AVX1-LABEL: splatconstant_shift_v4i64: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: sarq $7, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: sarq $7, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: sarq $7, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: sarq $7, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatconstant_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: sarq $7, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: sarq $7, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: sarq $7, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: sarq $7, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1 +; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-NEXT: retq %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift @@ -756,11 +714,11 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; ; AVX2-LABEL: splatconstant_shift_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 -; 
AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index f5a7e28383fe5..ca55800e2713d 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -12,26 +12,26 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrlq %xmm3, %xmm2 -; SSE2-NEXT: psrlq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: psrlq %xmm3, %xmm2 +; SSE2-NEXT: psrlq %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE2-NEXT: movapd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrlq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psrlq %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: psrlq %xmm1, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; @@ -46,73 +46,63 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: var_shift_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: shrl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] -; SSE2-NEXT: movd %xmm3, %ecx -; SSE2-NEXT: shrl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: shrl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: shrl %cl, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = 
xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrld %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrld %xmm2, %xmm4 +; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: psrld %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: pextrd $1, %xmm1, %ecx -; SSE41-NEXT: shrl %cl, %eax -; SSE41-NEXT: movd %xmm0, %edx -; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: shrl %cl, %edx -; SSE41-NEXT: movd %edx, %xmm2 -; SSE41-NEXT: pinsrd $1, %eax, %xmm2 -; SSE41-NEXT: pextrd $2, %xmm0, %eax -; SSE41-NEXT: pextrd $2, %xmm1, %ecx -; SSE41-NEXT: shrl %cl, %eax -; SSE41-NEXT: pinsrd $2, %eax, %xmm2 -; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: pextrd $3, %xmm1, %ecx -; SSE41-NEXT: shrl %cl, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm2, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld %xmm1, %xmm2 +; SSE41-NEXT: psrld %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: vpextrd $1, %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %edx -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: 
vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i32: @@ -126,84 +116,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: var_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: psllw $12, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $2, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $1, %xmm1 
-; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -211,9 +201,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %shift = lshr <8 x i16> %a, %b @@ -223,72 +213,72 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: var_shift_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: 
pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $1, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %shift = lshr <16 x i8> %a, %b @@ -343,10 +333,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: splatvar_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movzwl %ax, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: 
movd %eax, %xmm1 +; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v8i16: @@ -370,99 +360,99 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] -; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm1 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm4 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm4 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $2, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa 
%xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v16i8: ; AVX2: # BB#0: ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i8> %a, %splat @@ -477,24 +467,24 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $7, %xmm1 -; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: psrlq $7, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
constant_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlq $7, %xmm1 -; SSE41-NEXT: psrlq $1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlq $7, %xmm1 +; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; @@ -509,59 +499,44 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; SSE2-LABEL: constant_shift_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: shrl $7, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: shrl $5, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: shrl $4, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: shrl $6, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $7, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrld $5, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrld $6, %xmm2 +; SSE2-NEXT: psrld $4, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i32: -; SSE41: # BB#0: -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: shrl $5, %eax -; SSE41-NEXT: movd %xmm0, %ecx -; SSE41-NEXT: shrl $4, %ecx -; SSE41-NEXT: movd %ecx, %xmm1 -; SSE41-NEXT: pinsrd $1, %eax, %xmm1 -; SSE41-NEXT: pextrd $2, %xmm0, %eax -; SSE41-NEXT: shrl $6, %eax -; SSE41-NEXT: pinsrd $2, %eax, %xmm1 -; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: shrl $7, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $7, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $5, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $6, %xmm1 +; SSE41-NEXT: psrld $4, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i32: -; AVX1: # BB#0: -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: shrl $5, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: shrl $6, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: shrl $7, %eax -; AVX1-NEXT: vpinsrd 
$3, %eax, %xmm1, %xmm0 -; AVX1-NEXT: retq +; AVX1: # BB#0: +; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 +; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i32: ; AVX2: # BB#0: @@ -574,56 +549,56 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; SSE2-LABEL: constant_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664] ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784] +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,4112,8224,12336,16448,20560,24672,28784] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568] +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600] +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664] +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -631,9 +606,9 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> @@ -643,72 +618,72 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SSE2-LABEL: constant_shift_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 
+; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; SSE41-NEXT: psllw $5, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE41-NEXT: psllw $5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: constant_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> @@ -764,14 +739,14 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; SSE-LABEL: splatconstant_shift_v16i8: ; SSE: # BB#0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vpsrlw $3, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0 +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: 
vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <16 x i8> %shift diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index d200abd5f8755..bb0cceed77207 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -33,39 +33,30 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: var_shift_v8i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrd $1, %xmm2, %eax ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpextrd $1, %xmm3, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vmovd %xmm2, %edx -; AVX1-NEXT: vmovd %xmm3, %ecx -; AVX1-NEXT: shrl %cl, %edx -; AVX1-NEXT: vmovd %edx, %xmm4 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX1-NEXT: vpextrd $2, %xmm2, %eax -; AVX1-NEXT: vpextrd $2, %xmm3, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4 -; AVX1-NEXT: vpextrd $3, %xmm2, %eax -; AVX1-NEXT: vpextrd $3, %xmm3, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2 -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: vpextrd $1, %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %edx -; AVX1-NEXT: vmovd %edx, %xmm3 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; AVX1-NEXT: shrl %cl, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -167,17 +158,17 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; ; AVX2-LABEL: var_shift_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; 
AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %shift = lshr <32 x i8> %a, %b @@ -334,32 +325,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; AVX1-LABEL: constant_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: shrl $9, %eax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: shrl $8, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-NEXT: shrl $8, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: shrl $7, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: shrl $5, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: shrl $6, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: shrl $7, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 +; AVX1-NEXT: vpsrld $4, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $7, %xmm0, %xmm2 +; AVX1-NEXT: vpsrld $9, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v8i32: @@ -453,18 +432,18 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; ; AVX2-LABEL: constant_shift_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb 
%ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> @@ -540,8 +519,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; ; AVX2-LABEL: splatconstant_shift_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlw $3, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0 +; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index 3ac31ea636765..6dbd9eab2a72e 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -12,26 +12,26 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq %xmm3, %xmm2 -; SSE2-NEXT: psllq %xmm1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: psllq %xmm3, %xmm2 +; SSE2-NEXT: psllq %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE2-NEXT: movapd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: psllq %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: psllq %xmm1, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; @@ -46,33 +46,33 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: var_shift_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 
+; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i32: @@ -86,84 +86,84 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: var_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: psllw $12, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psllw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psllw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; 
SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psllw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psllw $2, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $2, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psllw $1, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $1, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -171,9 +171,9 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %shift = shl <8 x i16> %a, %b @@ -183,69 +183,69 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: var_shift_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand 
{{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq ; ; SSE41-LABEL: var_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psllw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psllw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; 
AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %shift = shl <16 x i8> %a, %b @@ -300,10 +300,10 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: splatvar_shift_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movzwl %ax, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psllw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v8i16: @@ -327,95 +327,95 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] -; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm1 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psllw $4, %xmm4 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 
+; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psllw $4, %xmm4 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psllw $2, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $2, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v16i8: ; AVX2: # BB#0: ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = shl <16 x i8> %a, %splat @@ -430,24 +430,24 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # BB#0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq $7, %xmm1 -; SSE2-NEXT: psllq $1, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; 
SSE2-NEXT: psllq $7, %xmm1 +; SSE2-NEXT: psllq $1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i64: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq $7, %xmm1 -; SSE41-NEXT: psllq $1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq $7, %xmm1 +; SSE41-NEXT: psllq $1, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; @@ -462,13 +462,13 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; SSE2-LABEL: constant_shift_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; @@ -507,69 +507,69 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SSE2-LABEL: constant_shift_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; 
SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v16i8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; SSE41-NEXT: psllw $5, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE41-NEXT: psllw $5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $2, %xmm2 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllw $2, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: paddb %xmm2, %xmm2 -; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: paddb %xmm2, %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: constant_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> @@ -625,14 +625,14 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; SSE-LABEL: splatconstant_shift_v16i8: ; SSE: # BB#0: -; SSE-NEXT: psllw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vpsllw $3, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0 +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <16 x i8> %shift diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index 7c13c0ae4716d..b287875f65417 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -193,7 +193,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # 
BB#0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw $3, %xmm1, %xmm2, %xmm1 # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -341,7 +341,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm4 # xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 ; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $2, %xmm1, %xmm2 diff --git a/test/CodeGen/X86/vector-shuffle-sse4a.ll b/test/CodeGen/X86/vector-shuffle-sse4a.ll new file mode 100644 index 0000000000000..26062335cc168 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -0,0 +1,221 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2 + +; +; EXTRQI +; + +define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu: +; BTVER1: # BB#0: +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz: +; BTVER1: # BB#0: +; BTVER1-NEXT: movaps %xmm0, %xmm1 +; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu: +; BTVER1: # BB#0: +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz: +; BTVER1: # BB#0: +; BTVER1-NEXT: movaps %xmm0, %xmm1 +; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) { +; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s +} + +define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) { +; ALL-LABEL: shuf_1zzzuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) { +; ALL-LABEL: shuf_12zzuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) { +; ALL-LABEL: shuf_012zuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) { +; BTVER1-LABEL: shuf_0zzz1zzz: +; BTVER1: # BB#0: +; BTVER1-NEXT: movaps %xmm0, %xmm1 +; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0zzz1zzz: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8> + ret <8 x i16> %s +} + +define <4 x i32> @shuf_0z1z(<4 x i32> %a0) { +; BTVER1-LABEL: shuf_0z1z: +; BTVER1: # BB#0: +; BTVER1-NEXT: pxor %xmm1, %xmm1 +; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0z1z: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxdq 
{{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; BTVER2-NEXT: retq + %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4> + ret <4 x i32> %s +} + +; +; INSERTQI +; + +define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) { +; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) { +; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) { +; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s +} + +define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0823uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0183uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0128uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0893uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_089Auuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} + +define <8 x i16> 
@shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_089uuuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %s +} diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index d2eef9af2a25e..2480e676cad08 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -223,15 +223,15 @@ entry: } define <16 x i8> @trunc16i64_const() { -; SSE-LABEL: trunc16i64_const -; SSE: # BB#0: # %entry -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: trunc16i64_const -; AVX: # BB#0: # %entry -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-LABEL: trunc16i64_const: +; SSE: # BB#0: # %entry +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc16i64_const: +; AVX: # BB#0: # %entry +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq entry: %0 = trunc <16 x i64> zeroinitializer to <16 x i8> diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll index c64e17442675e..b119f5eb89f67 100644 --- a/test/CodeGen/X86/vector-zext.ll +++ b/test/CodeGen/X86/vector-zext.ll @@ -11,7 +11,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pand .LCPI0_0(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_8i16_to_8i32: @@ -20,7 +20,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pand .LCPI0_0(%rip), %xmm1 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_8i16_to_8i32: @@ -28,7 +28,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE41-NEXT: pand .LCPI0_0(%rip), %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_8i16_to_8i32: @@ -156,7 +156,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pand .LCPI3_0(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_16i8_to_16i16: @@ -165,15 +165,15 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: pand .LCPI3_0(%rip), %xmm1 +; SSSE3-NEXT: pand {{.*}}(%rip), 
%xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: zext_16i8_to_16i16: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pand .LCPI3_0(%rip), %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_16i8_to_16i16: @@ -195,24 +195,24 @@ entry: define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) { ; SSE2-LABEL: load_zext_16i8_to_16i16: -; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pand .LCPI4_0(%rip), %xmm1 -; SSE2-NEXT: retq +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_zext_16i8_to_16i16: -; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: pand .LCPI4_0(%rip), %xmm1 -; SSSE3-NEXT: retq +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_16i8_to_16i16: ; SSE41: # BB#0: # %entry @@ -239,24 +239,24 @@ entry: define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) { ; SSE2-LABEL: load_zext_8i16_to_8i32: -; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pand .LCPI5_0(%rip), %xmm1 -; SSE2-NEXT: retq +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_zext_8i16_to_8i32: 
-; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pand .LCPI5_0(%rip), %xmm1 -; SSSE3-NEXT: retq +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_zext_8i16_to_8i32: ; SSE41: # BB#0: # %entry @@ -415,7 +415,7 @@ entry: define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) { ; SSE2-LABEL: shuf_zext_8i8_to_8i32: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pand .LCPI9_0(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 diff --git a/test/CodeGen/X86/vector-zmov.ll b/test/CodeGen/X86/vector-zmov.ll index cf592b1e9f422..2986835590543 100644 --- a/test/CodeGen/X86/vector-zmov.ll +++ b/test/CodeGen/X86/vector-zmov.ll @@ -5,15 +5,16 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) { -; SSE-LABEL: load_zmov_4i32_to_0zzz: -; SSE: # BB#0: # %entry -; SSE-NEXT: movd (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE-LABEL: load_zmov_4i32_to_0zzz: +; SSE: # BB#0: # %entry +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: retq +; +; AVX-LABEL: load_zmov_4i32_to_0zzz: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: retq -; AVX-LABEL: load_zmov_4i32_to_0zzz: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovd (%rdi), %xmm0 -; AVX-NEXT: retq entry: %X = load <4 x i32>, <4 x i32>* %ptr %Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4> @@ -21,15 +22,16 @@ entry: } define <2 x i64> @load_zmov_2i64_to_0z(<2 x i64> *%ptr) { -; SSE-LABEL: load_zmov_2i64_to_0z: -; SSE: # BB#0: # %entry -; SSE-NEXT: movq (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE-LABEL: load_zmov_2i64_to_0z: +; SSE: # BB#0: # %entry +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: retq +; +; AVX-LABEL: load_zmov_2i64_to_0z: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: retq -; AVX-LABEL: load_zmov_2i64_to_0z: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovq (%rdi), %xmm0 -; AVX-NEXT: retq entry: %X = load <2 x i64>, <2 x i64>* %ptr %Y = shufflevector <2 x i64> %X, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> diff --git a/test/CodeGen/X86/visibility.ll b/test/CodeGen/X86/visibility.ll index 580c3dc9266d6..be7fd96f2dd99 100644 --- a/test/CodeGen/X86/visibility.ll +++ b/test/CodeGen/X86/visibility.ll @@ -2,13 +2,19 @@ @zed = external hidden constant i32 +define available_externally hidden void @baz() { + ret void +} + define hidden void @foo() nounwind { entry: call void @bar(i32* @zed) + call void @baz() ret void } declare hidden void @bar(i32*) ;CHECK: .hidden zed +;CHECK: .hidden baz ;CHECK: .hidden bar diff --git a/test/CodeGen/X86/vshift-3.ll b/test/CodeGen/X86/vshift-3.ll index 0bdb32fcb86e1..f368029e4b494 100644 --- a/test/CodeGen/X86/vshift-3.ll +++ b/test/CodeGen/X86/vshift-3.ll @@ -3,13 
+3,12 @@ ; test vector shifts converted to proper SSE2 vector shifts when the shift ; amounts are the same. -; Note that x86 does have ashr +; Note that x86 does have ashr -; shift1a can't use a packed shift define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind { entry: ; CHECK-LABEL: shift1a: -; CHECK: sarl +; CHECK: psrad $31 %ashr = ashr <2 x i64> %val, < i64 32, i64 32 > store <2 x i64> %ashr, <2 x i64>* %dst ret void diff --git a/test/CodeGen/X86/webkit-jscc.ll b/test/CodeGen/X86/webkit-jscc.ll new file mode 100644 index 0000000000000..a58c53e024ec2 --- /dev/null +++ b/test/CodeGen/X86/webkit-jscc.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-windows-gnu -mcpu=corei7 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-windows-msvc -mcpu=corei7 < %s | FileCheck %s + +define webkit_jscc i32 @simple_jscall(i32 %a, i32 %b, i32 %c) { + %ab = add i32 %a, %b + %abc = add i32 %ab, %c + ret i32 %abc +} + +; 32-bit integers are only aligned to 4 bytes, even on x64. They are *not* +; promoted to i64. + +; CHECK: simple_jscall: +; CHECK: addl 8(%rsp), %eax +; CHECK-NEXT: addl 12(%rsp), %eax +; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll index 906f7cdafb958..c8646c6489a15 100644 --- a/test/CodeGen/X86/widen_conv-2.ll +++ b/test/CodeGen/X86/widen_conv-2.ll @@ -1,8 +1,9 @@ ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -; CHECK: {{cwtl|movswl}} -; CHECK: {{cwtl|movswl}} +; CHECK: psllq $48, %xmm0 +; CHECK: psrad $16, %xmm0 +; CHECK: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; sign extension v2i32 to v2i16 +; sign extension v2i16 to v2i32 define void @convert(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind { entry: diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll index f5ddc0eacc614..6f1bd7541231a 100644 --- a/test/CodeGen/X86/widen_load-2.ll +++ b/test/CodeGen/X86/widen_load-2.ll @@ -194,17 +194,9 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa ; CHECK-NEXT: movl (%[[PTR0]]), [[TMP1:%e[abcd]+x]] ; CHECK-NEXT: movl [[TMP1]], [[TMP2:.*]] ; CHECK-NEXT: pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]] -; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x -; CHECK-NEXT: shrl %e[[R0]]x -; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x -; CHECK-NEXT: shrl %e[[R1]]x -; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]] -; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]] -; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x -; CHECK-NEXT: shrl %e[[R0]]x -; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]] -; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x -; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]] +; CHECK-NEXT: movdqa %[[X0]], %[[X1:xmm[0-9]+]] +; CHECK-NEXT: psrld $1, %[[X1]] +; CHECK-NEXT: pblendw $192, %[[X0]], %[[X1]] ; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}}) ; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X1]] ; CHECK-NEXT: pmovzxwq %[[X1]], %[[X3:xmm[0-9]+]] diff --git a/test/CodeGen/X86/win32-eh.ll b/test/CodeGen/X86/win32-eh.ll index f235d2884d03b..3ee4723ce5f3a 100644 --- a/test/CodeGen/X86/win32-eh.ll +++ b/test/CodeGen/X86/win32-eh.ll @@ -32,16 +32,19 @@ eh.resume: ; CHECK-LABEL: _use_except_handler3: ; CHECK: pushl %ebp ; CHECK: movl %esp, %ebp +; CHECK: pushl %ebx +; CHECK: pushl %edi +; CHECK: pushl %esi ; CHECK: subl ${{[0-9]+}}, %esp -; CHECK: movl $-1, -4(%ebp) -; CHECK: movl $L__ehtable$use_except_handler3, -8(%ebp) -; CHECK: leal -16(%ebp), %[[node:[^ 
,]*]] -; CHECK: movl $__except_handler3, -12(%ebp) +; CHECK: movl $-1, -16(%ebp) +; CHECK: movl $L__ehtable$use_except_handler3, -20(%ebp) +; CHECK: leal -28(%ebp), %[[node:[^ ,]*]] +; CHECK: movl $__except_handler3, -24(%ebp) ; CHECK: movl %fs:0, %[[next:[^ ,]*]] -; CHECK: movl %[[next]], -16(%ebp) +; CHECK: movl %[[next]], -28(%ebp) ; CHECK: movl %[[node]], %fs:0 ; CHECK: calll _may_throw_or_crash -; CHECK: movl -16(%ebp), %[[next:[^ ,]*]] +; CHECK: movl -28(%ebp), %[[next:[^ ,]*]] ; CHECK: movl %[[next]], %fs:0 ; CHECK: retl @@ -72,18 +75,18 @@ eh.resume: ; CHECK: pushl %ebp ; CHECK: movl %esp, %ebp ; CHECK: subl ${{[0-9]+}}, %esp -; CHECK: movl %esp, -24(%ebp) -; CHECK: movl $-2, -4(%ebp) +; CHECK: movl %esp, -36(%ebp) +; CHECK: movl $-2, -16(%ebp) ; CHECK: movl $L__ehtable$use_except_handler4, %[[lsda:[^ ,]*]] ; CHECK: xorl ___security_cookie, %[[lsda]] -; CHECK: movl %[[lsda]], -8(%ebp) -; CHECK: leal -16(%ebp), %[[node:[^ ,]*]] -; CHECK: movl $__except_handler4, -12(%ebp) +; CHECK: movl %[[lsda]], -20(%ebp) +; CHECK: leal -28(%ebp), %[[node:[^ ,]*]] +; CHECK: movl $__except_handler4, -24(%ebp) ; CHECK: movl %fs:0, %[[next:[^ ,]*]] -; CHECK: movl %[[next]], -16(%ebp) +; CHECK: movl %[[next]], -28(%ebp) ; CHECK: movl %[[node]], %fs:0 ; CHECK: calll _may_throw_or_crash -; CHECK: movl -16(%ebp), %[[next:[^ ,]*]] +; CHECK: movl -28(%ebp), %[[next:[^ ,]*]] ; CHECK: movl %[[next]], %fs:0 ; CHECK: retl @@ -115,20 +118,21 @@ catchall: ; CHECK: pushl %ebp ; CHECK: movl %esp, %ebp ; CHECK: subl ${{[0-9]+}}, %esp -; CHECK: movl %esp, -16(%ebp) -; CHECK: movl $-1, -4(%ebp) -; CHECK: leal -12(%ebp), %[[node:[^ ,]*]] -; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -8(%ebp) +; CHECK: movl %esp, -28(%ebp) +; CHECK: movl $-1, -16(%ebp) +; CHECK: leal -24(%ebp), %[[node:[^ ,]*]] +; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -20(%ebp) ; CHECK: movl %fs:0, %[[next:[^ ,]*]] -; CHECK: movl %[[next]], -12(%ebp) +; CHECK: movl %[[next]], -24(%ebp) ; CHECK: movl %[[node]], %fs:0 -; CHECK: movl $0, -4(%ebp) +; CHECK: movl $0, -16(%ebp) ; CHECK: calll _may_throw_or_crash -; CHECK: movl -12(%ebp), %[[next:[^ ,]*]] +; CHECK: movl -24(%ebp), %[[next:[^ ,]*]] ; CHECK: movl %[[next]], %fs:0 ; CHECK: retl ; CHECK: .section .xdata,"dr" +; CHECK: .align 4 ; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3: ; CHECK-NEXT: .long 429065506 ; CHECK-NEXT: .long 2 diff --git a/test/CodeGen/X86/win64_frame.ll b/test/CodeGen/X86/win64_frame.ll index 2c62f4918a7f0..477b3144d9e73 100644 --- a/test/CodeGen/X86/win64_frame.ll +++ b/test/CodeGen/X86/win64_frame.ll @@ -100,8 +100,9 @@ define i32 @f8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) "no-frame-pointer-elim"=" alloca i32, i32 %a ; CHECK: movl %ecx, %eax - ; CHECK: leaq 15(,%rax,4), %rax - ; CHECK: andq $-16, %rax + ; CHECK: leaq 15(,%rax,4), %rcx + ; CHECK: movabsq $34359738352, %rax + ; CHECK: andq %rcx, %rax ; CHECK: callq __chkstk ; CHECK: subq %rax, %rsp diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll index 5848eddf4375f..8c91335d91a2b 100644 --- a/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -598,3 +598,42 @@ if.then.60: ; preds = %if.end.55 cleanup: ; preds = %if.then.60, %if.end.55, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %if.end, %entry ret void } + +; Make sure we do not insert unreachable code after noreturn function. 
+; Although it is not incorrect to insert such code, it is useless
+; and it hurts binary size.
+;
+; CHECK-LABEL: noreturn:
+; DISABLE: pushq
+;
+; CHECK: testb %dil, %dil
+; CHECK-NEXT: jne [[ABORT:LBB[0-9_]+]]
+;
+; CHECK: movl $42, %eax
+;
+; DISABLE-NEXT: popq
+;
+; CHECK-NEXT: retq
+;
+; CHECK: [[ABORT]]: ## %if.abort
+;
+; ENABLE: pushq
+;
+; CHECK: callq _abort
+; ENABLE-NOT: popq
+define i32 @noreturn(i8 signext %bad_thing) {
+entry:
+  %tobool = icmp eq i8 %bad_thing, 0
+  br i1 %tobool, label %if.end, label %if.abort
+
+if.abort:
+  tail call void @abort() #0
+  unreachable
+
+if.end:
+  ret i32 42
+}
+
+declare void @abort() #0
+
+attributes #0 = { noreturn nounwind }
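The ENABLE and DISABLE prefixes checked above are selected by RUN lines at the top of x86-shrink-wrapping.ll, which this hunk does not show. As a minimal sketch only, assuming the -enable-shrink-wrap llc option and a Darwin x86-64 triple (both illustrative, not taken from this commit), such a pair of RUN lines would look like:

; RUN: llc %s -o - -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true \
; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
; RUN: llc %s -o - -mtriple=x86_64-apple-darwin -enable-shrink-wrap=false \
; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE

Under the enabled run, the pushq is expected only on the cold %if.abort path and no popq is inserted after the noreturn call, while the disabled run keeps the prologue and epilogue around the straight-line path that returns 42.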