Diffstat (limited to 'test/CodeGen/X86'): 65 files changed, 9322 insertions(+), 2715 deletions(-)
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index 414ccf469ebbd..634f66ad52dea 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -1,9 +1,10 @@
-; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
-; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR --check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR --check-prefix=CHECK

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

+;CHECK-LABEL: myCall_w2:
 ;YESCOLOR: subq $144, %rsp
 ;NOCOLOR: subq $272, %rsp
@@ -28,6 +29,7 @@ entry:
 }

+;CHECK-LABEL: myCall2_no_merge
 ;YESCOLOR: subq $272, %rsp
 ;NOCOLOR: subq $272, %rsp
@@ -56,6 +58,7 @@ bb3:
   ret i32 0
 }

+;CHECK-LABEL: myCall2_w2
 ;YESCOLOR: subq $144, %rsp
 ;NOCOLOR: subq $272, %rsp
@@ -82,12 +85,11 @@ bb2:
 bb3:
   ret i32 0
 }
+
+;CHECK-LABEL: myCall_w4:
 ;YESCOLOR: subq $200, %rsp
 ;NOCOLOR: subq $408, %rsp
-
-
-
 define i32 @myCall_w4(i32 %in) {
 entry:
   %a1 = alloca [14 x i8*], align 8
@@ -119,6 +121,7 @@ entry:
   ret i32 %t7
 }

+;CHECK-LABEL: myCall2_w4:
 ;YESCOLOR: subq $112, %rsp
 ;NOCOLOR: subq $400, %rsp
@@ -158,6 +161,7 @@ bb3:
 }

+;CHECK-LABEL: myCall2_noend:
 ;YESCOLOR: subq $144, %rsp
 ;NOCOLOR: subq $272, %rsp
@@ -185,6 +189,7 @@ bb3:
   ret i32 0
 }

+;CHECK-LABEL: myCall2_noend2:
 ;YESCOLOR: subq $144, %rsp
 ;NOCOLOR: subq $272, %rsp
 define i32 @myCall2_noend2(i32 %in, i1 %d) {
@@ -211,6 +216,7 @@ bb3:
 }

+;CHECK-LABEL: myCall2_nostart:
 ;YESCOLOR: subq $144, %rsp
 ;NOCOLOR: subq $272, %rsp
 define i32 @myCall2_nostart(i32 %in, i1 %d) {
@@ -236,6 +242,7 @@ bb3:
 }

 ; Adopt the test from Transforms/Inline/array_merge.ll'
+;CHECK-LABEL: array_merge:
 ;YESCOLOR: subq $816, %rsp
 ;NOCOLOR: subq $1616, %rsp
 define void @array_merge() nounwind ssp {
@@ -261,6 +268,7 @@ entry:
   ret void
 }

+;CHECK-LABEL: func_phi_lifetime:
 ;YESCOLOR: subq $272, %rsp
 ;NOCOLOR: subq $272, %rsp
 define i32 @func_phi_lifetime(i32 %in, i1 %d) {
@@ -297,8 +305,7 @@ bb3:
 }

-;YESCOLOR-LABEL: multi_region_bb:
-;NOCOLOR-LABEL: multi_region_bb:
+;CHECK-LABEL: multi_region_bb:
 define void @multi_region_bb() nounwind ssp {
 entry:
   %A.i1 = alloca [100 x i32], align 4
@@ -323,10 +330,9 @@ entry:
   call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
   ret void
 }
-
-
 ;YESCOLOR: subq $272, %rsp
 ;NOCOLOR: subq $272, %rsp
+
 define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
 entry:
   %a = alloca [17 x i8*], align 8
@@ -353,9 +359,8 @@ bb3:

 ; Regression test for PR15707. %buf1 and %buf2 should not be merged
 ; in this test case.
-;YESCOLOR-LABEL: myCall_pr15707:
+;CHECK-LABEL: myCall_pr15707:
 ;YESCOLOR: subq $200008, %rsp
-;NOCOLOR-LABEL: myCall_pr15707:
 ;NOCOLOR: subq $200008, %rsp
 define void @myCall_pr15707() {
   %buf1 = alloca i8, i32 100000, align 16
@@ -374,8 +379,7 @@ define void @myCall_pr15707() {

 ; Check that we don't assert and crash even when there are allocas
 ; outside the declared lifetime regions.
-;YESCOLOR-LABEL: bad_range:
-;NOCOLOR-LABEL: bad_range:
+;CHECK-LABEL: bad_range:
 define void @bad_range() nounwind ssp {
 entry:
   %A.i1 = alloca [100 x i32], align 4
@@ -400,8 +404,7 @@ block2:

 ; Check that we don't assert and crash even when there are usages
 ; of allocas which do not read or write outside the declared lifetime regions.
-;YESCOLOR-LABEL: shady_range:
-;NOCOLOR-LABEL: shady_range:
+;CHECK-LABEL: shady_range:

 %struct.Klass = type { i32, i32 }
diff --git a/test/CodeGen/X86/asm-mismatched-types.ll b/test/CodeGen/X86/asm-mismatched-types.ll
new file mode 100644
index 0000000000000..97f9c0872f8f5
--- /dev/null
+++ b/test/CodeGen/X86/asm-mismatched-types.ll
@@ -0,0 +1,135 @@
+; RUN: llc -o - %s -no-integrated-as | FileCheck %s
+target triple = "x86_64--"
+
+; Allow to specify any of the 8/16/32/64 register names interchangeably in
+; constraints
+
+; Produced by C-programs like this:
+; void foo(int p) { register int reg __asm__("r8") = p;
+;                   __asm__ __volatile__("# REG: %0" : : "r" (reg)); }
+
+; CHECK-LABEL: reg64_as_32:
+; CHECK: # REG: %r8d
+define void @reg64_as_32(i32 %p) {
+  call void asm sideeffect "# REG: $0", "{r8}"(i32 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg64_as_32_float:
+; CHECK: # REG: %r8d
+define void @reg64_as_32_float(float %p) {
+  call void asm sideeffect "# REG: $0", "{r8}"(float %p)
+  ret void
+}
+
+; CHECK-LABEL: reg64_as_16:
+; CHECK: # REG: %r9w
+define void @reg64_as_16(i16 %p) {
+  call void asm sideeffect "# REG: $0", "{r9}"(i16 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg64_as_8:
+; CHECK: # REG: %bpl
+define void @reg64_as_8(i8 %p) {
+  call void asm sideeffect "# REG: $0", "{rbp}"(i8 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg32_as_16:
+; CHECK: # REG: %r15w
+define void @reg32_as_16(i16 %p) {
+  call void asm sideeffect "# REG: $0", "{r15d}"(i16 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg32_as_8:
+; CHECK: # REG: %r12b
+define void @reg32_as_8(i8 %p) {
+  call void asm sideeffect "# REG: $0", "{r12d}"(i8 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg16_as_8:
+; CHECK: # REG: %cl
+define void @reg16_as_8(i8 %p) {
+  call void asm sideeffect "# REG: $0", "{cx}"(i8 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg32_as_64:
+; CHECK: # REG: %rbp
+define void @reg32_as_64(i64 %p) {
+  call void asm sideeffect "# REG: $0", "{ebp}"(i64 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg32_as_64_float:
+; CHECK: # REG: %rbp
+define void @reg32_as_64_float(double %p) {
+  call void asm sideeffect "# REG: $0", "{ebp}"(double %p)
+  ret void
+}
+
+; CHECK-LABEL: reg16_as_64:
+; CHECK: # REG: %r13
+define void @reg16_as_64(i64 %p) {
+  call void asm sideeffect "# REG: $0", "{r13w}"(i64 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg16_as_64_float:
+; CHECK: # REG: %r13
+define void @reg16_as_64_float(double %p) {
+  call void asm sideeffect "# REG: $0", "{r13w}"(double %p)
+  ret void
+}
+
+; CHECK-LABEL: reg8_as_64:
+; CHECK: # REG: %rax
+define void @reg8_as_64(i64 %p) {
+  call void asm sideeffect "# REG: $0", "{al}"(i64 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg8_as_64_float:
+; CHECK: # REG: %rax
+define void @reg8_as_64_float(double %p) {
+  call void asm sideeffect "# REG: $0", "{al}"(double %p)
+  ret void
+}
+
+; CHECK-LABEL: reg16_as_32:
+; CHECK: # REG: %r11d
+define void @reg16_as_32(i32 %p) {
+  call void asm sideeffect "# REG: $0", "{r11w}"(i32 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg16_as_32_float:
+; CHECK: # REG: %r11d
+define void @reg16_as_32_float(float %p) {
+  call void asm sideeffect "# REG: $0", "{r11w}"(float %p)
+  ret void
+}
+
+; CHECK-LABEL: reg8_as_32:
+; CHECK: # REG: %r9d
+define void @reg8_as_32(i32 %p) {
+  call void asm sideeffect "# REG: $0", "{r9b}"(i32 %p)
+  ret void
+}
+
+; CHECK-LABEL: reg8_as_32_float:
+; CHECK: # REG: %r9d
+define void @reg8_as_32_float(float %p) {
+  call void asm sideeffect "# REG: $0", "{r9b}"(float %p)
+  ret void
+}
+
+; CHECK-LABEL: reg8_as_16:
+; CHECK: # REG: %di
+define void @reg8_as_16(i16 %p) {
+  call void asm sideeffect "# REG: $0", "{dil}"(i16 %p)
+  ret void
+}
diff --git a/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll b/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll
index 016e2d261eef6..c7e86f565eefa 100644
--- a/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll
+++ b/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll
@@ -1,10 +1,8 @@
-; RUN: not llc -no-integrated-as %s -o - 2> %t1
-; RUN: FileCheck %s < %t1
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+; RUN: not llc -o /dev/null %s 2>&1 | FileCheck %s
 target triple = "x86_64--"

 ; CHECK: error: couldn't allocate output register for constraint '{ax}'
 define i128 @blup() {
-  %v = tail call i128 asm "", "={ax},0,~{dirflag},~{fpsr},~{flags}"(i128 0)
+  %v = tail call i128 asm "", "={ax},0"(i128 0)
   ret i128 %v
 }
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index e70d9f3ad521c..e5373c575c1ad 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -1,15 +1,5 @@
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

-define <16 x i32> @test1(i32* %x) {
-; CHECK-LABEL: test1:
-; CHECK: vmovd (%rdi), %xmm
-; CHECK: vmovdqa32
-; CHECK: vpermt2d %zmm
-  %y = load i32, i32* %x, align 4
-  %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
-  ret <16 x i32>%res
-}
-
 define <16 x i32> @test2(<16 x i32> %x) {
 ; CHECK-LABEL: test2:
 ; CHECK: ## BB#0:
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index 9814a6108272a..c30fc909f09b5 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,422 +1,675 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s

-declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
-declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-
-define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfmsubpd_z
-  ; CHECK: vfmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <8 x double> @test_mask_vfmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfmsub_pd
-  ; CHECK: vfmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
-  ret <8 x double> %res
-}
+declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)

 define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_ps_z
   ; CHECK: vfnmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone

 define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
   ; CHECK-LABEL: test_mask_vfnmadd_ps
   ; CHECK: vfnmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }

 define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_pd_z
   ; CHECK: vfnmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone

 define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_vfnmadd_pd
   ; CHECK: vfnmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }

 define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubps_z
   ; CHECK: vfnmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone

 define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
   ; CHECK-LABEL: test_mask_vfnmsub_ps
   ; CHECK: vfnmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }

 define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubpd_z
   ; CHECK: vfnmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone

 define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_vfnmsub_pd
   ; CHECK: vfnmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }

 define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmaddsubps_z
   ; CHECK: vfmaddsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
 ; CHECK-LABEL: test_mask_fmaddsub_ps:
 ; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone

 define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmaddsubpd_z
   ; CHECK: vfmaddsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone

 define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_vfmaddsub_pd
   ; CHECK: vfmaddsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
-  ret <8 x double> %res
-}
-
-define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfmsubaddps_z
-  ; CHECK: vfmsubadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_mask_vfmsubadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfmsubadd_ps
-  ; CHECK: vfmsubadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
-  ret <16 x float> %res
-}
-
-define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfmsubaddpd_z
-  ; CHECK: vfmsubadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
-define <8 x double> @test_mask_vfmsubadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfmsubadd_pd
-  ; CHECK: vfmsubadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
-  ret <8 x double> %res
+define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne
   ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn
   ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp
   ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz
   ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current
   ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne
   ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn
   ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp
   ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz
   ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
   ret <16 x float> %res
 }

 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current
   ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rne
-  ; CHECK: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtn
-  ; CHECK: vfmsub213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtp
-  ; CHECK: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtz
-  ; CHECK: vfmsub213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_current
-  ; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rne
-  ; CHECK: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtn
-  ; CHECK: vfmsub213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtp
-  ; CHECK: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtz
-  ; CHECK: vfmsub213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }

-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_current
-  ; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xaa,0xc2]
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
-  ret <16 x float> %res
+declare <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne
   ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn
   ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp
   ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz
   ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current
   ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne
   ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn
   ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp
   ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz
   ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current
   ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }

+define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne
   ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn
   ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp
   ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz
   ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current
   ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne
   ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn
   ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp
   ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz
   ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res
 }

 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current
   ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
+
+define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfnmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
index d6926e2571abd..ed046de005cf6 100644
--- a/test/CodeGen/X86/avx512-fma.ll
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=SKX

 ; CHECK-LABEL: test_x86_fmadd_ps_z
 ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0
@@ -58,26 +59,129 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8
   ret <8 x double> %res
 }

-define double @test_x86_fmsub_sd_z(double %a0, double %a1, double %a2) {
+define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
+; CHECK-LABEL: test_x86_fmsub_213:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
   %x = fmul double %a0, %a1
   %res = fsub double %x, %a2
   ret double %res
 }

-;CHECK-LABEL: test132_br
-;CHECK: vfmadd132ps LCP{{.*}}(%rip){1to16}
-;CHECK: ret
-define <16 x float> @test132_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
+; CHECK-LABEL: test_x86_fmsub_213_m:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %a2 = load double , double *%a2_ptr
+  %x = fmul double %a0, %a1
+  %res = fsub double %x, %a2
+  ret double %res
+}
+
+define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
+; CHECK-LABEL: test_x86_fmsub_231_m:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %a2 = load double , double *%a2_ptr
+  %x = fmul double %a0, %a2
+  %res = fsub double %x, %a1
+  ret double %res
+}
+
+define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; CHECK-LABEL: test231_br:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
   %b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
   %b2 = fadd <16 x float> %b1, %a2
   ret <16 x float> %b2
 }

-;CHECK-LABEL: test213_br
-;CHECK: vfmadd213ps LCP{{.*}}(%rip){1to16}
-;CHECK: ret
 define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; CHECK-LABEL: test213_br:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
+; CHECK-NEXT: retq
   %b1 = fmul <16 x float> %a1, %a2
   %b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
   ret <16 x float> %b2
 }
+
+;mask (a*c+b , a)
+define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd132_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
+; CHECK-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmadd132_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+  %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+  %x = fmul <16 x float> %a0, %a2
+  %y = fadd <16 x float> %x, %a1
+  %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0
+  ret <16 x float> %res
+}
+
+;mask (a*c+b , b)
+define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd231_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
+; CHECK-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmadd231_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+  %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+  %x = fmul <16 x float> %a0, %a2
+  %y = fadd <16 x float> %x, %a1
+  %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+  ret <16 x float> %res
+}
+
+;mask (b*a+c , b)
+define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd213_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
+; CHECK-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmadd213_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+  %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+  %x = fmul <16 x float> %a1, %a0
+  %y = fadd <16 x float> %x, %a2
+  %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+  ret <16 x float> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 0e32a1c280676..3fca5a89a6a48 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s

 declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
 declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
@@ -10,52 +10,60 @@ declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>,
 declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
 declare void @llvm.x86.avx512.scatter.qpd.512
(i8*, i8, <8 x i64>, <8 x double>, i32) -;CHECK-LABEL: gather_mask_dps -;CHECK: kmovw -;CHECK: vgatherdps -;CHECK: vpadd -;CHECK: vscatterdps -;CHECK: ret define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dps: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_dpd -;CHECK: kmovw -;CHECK: vgatherdpd -;CHECK: vpadd -;CHECK: vscatterdpd -;CHECK: ret define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dpd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qps -;CHECK: kmovw -;CHECK: vgatherqps -;CHECK: vpadd -;CHECK: vscatterqps -;CHECK: ret define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_qps: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3> call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qpd -;CHECK: kmovw -;CHECK: vgatherqpd -;CHECK: vpadd -;CHECK: vscatterqpd -;CHECK: ret define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_qpd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3> call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4) @@ -74,162 +82,710 @@ declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i3 declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, 
i8*, <8 x i64>, i8, i32) declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32) -;CHECK-LABEL: gather_mask_dd -;CHECK: kmovw -;CHECK: vpgatherdd -;CHECK: vpadd -;CHECK: vpscatterdd -;CHECK: ret define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qd -;CHECK: kmovw -;CHECK: vpgatherqd -;CHECK: vpadd -;CHECK: vpscatterqd -;CHECK: ret define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_qd: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3> call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_qq -;CHECK: kmovw -;CHECK: vpgatherqq -;CHECK: vpadd -;CHECK: vpscatterqq -;CHECK: ret define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_qq: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3> call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4) ret void } -;CHECK-LABEL: gather_mask_dq -;CHECK: kmovw -;CHECK: vpgatherdq -;CHECK: vpadd -;CHECK: vpscatterdq -;CHECK: ret define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_mask_dq: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2} +; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4) ret void } - -;CHECK-LABEL: gather_mask_dpd_execdomain -;CHECK: vgatherdpd -;CHECK: vmovapd -;CHECK: ret define void @gather_mask_dpd_execdomain(<8 x 
i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) { +; CHECK-LABEL: gather_mask_dpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, (%rdx) +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) store <8 x double> %x, <8 x double>* %stbuf ret void } -;CHECK-LABEL: gather_mask_qpd_execdomain -;CHECK: vgatherqpd -;CHECK: vmovapd -;CHECK: ret define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) { +; CHECK-LABEL: gather_mask_qpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vmovapd %zmm1, (%rdx) +; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) store <8 x double> %x, <8 x double>* %stbuf ret void } -;CHECK-LABEL: gather_mask_dps_execdomain -;CHECK: vgatherdps -;CHECK: vmovaps -;CHECK: ret define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) { +; CHECK-LABEL: gather_mask_dps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) ret <16 x float> %res; } -;CHECK-LABEL: gather_mask_qps_execdomain -;CHECK: vgatherqps -;CHECK: vmovaps -;CHECK: ret define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) { +; CHECK-LABEL: gather_mask_qps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) ret <8 x float> %res; } -;CHECK-LABEL: scatter_mask_dpd_execdomain -;CHECK: vmovapd -;CHECK: vscatterdpd -;CHECK: ret define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) { - %x = load <8 x double>, <8 x double>* %src, align 64 +; CHECK-LABEL: scatter_mask_dpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm1 +; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1} +; CHECK-NEXT: retq + %x = load <8 x double>, <8 x double>* %src, align 64 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4) ret void } -;CHECK-LABEL: scatter_mask_qpd_execdomain -;CHECK: vmovapd -;CHECK: vscatterqpd -;CHECK: ret define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) { +; CHECK-LABEL: scatter_mask_qpd_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovapd (%rdi), %zmm1 +; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = load <8 x double>, <8 x double>* %src, align 64 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4) ret void } -;CHECK-LABEL: scatter_mask_dps_execdomain -;CHECK: vmovaps -;CHECK: vscatterdps -;CHECK: ret define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* 
%stbuf) { +; CHECK-LABEL: scatter_mask_dps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = load <16 x float>, <16 x float>* %src, align 64 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4) ret void } -;CHECK-LABEL: scatter_mask_qps_execdomain -;CHECK: vmovaps -;CHECK: vscatterqps -;CHECK: ret define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) { - %x = load <8 x float>, <8 x float>* %src, align 32 +; CHECK-LABEL: scatter_mask_qps_execdomain: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: retq + %x = load <8 x float>, <8 x float>* %src, align 32 call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4) ret void } -;CHECK-LABEL: gather_qps -;CHECK: kxnorw -;CHECK: vgatherqps -;CHECK: vpadd -;CHECK: vscatterqps -;CHECK: ret define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) { +; CHECK-LABEL: gather_qps: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} +; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} +; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4) %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3> call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4) ret void } -;CHECK-LABEL: prefetch -;CHECK: gatherpf0 -;CHECK: gatherpf1 -;CHECK: scatterpf0 -;CHECK: scatterpf1 -;CHECK: ret declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32); declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32); define void @prefetch(<8 x i64> %ind, i8* %base) { +; CHECK-LABEL: prefetch: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: retq call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0) call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1) call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0) call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1) ret void } + + +declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32) + +define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div2_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> 
@llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div2_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32) + +define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div4_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,0), %ymm0 {%k1} +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div4_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1} +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8) + %res1 = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32) + +define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqps (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { +; 
CHECK-LABEL: test_int_x86_avx512_gather3div4_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2} +; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32) + +define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherqps (%rdi,%ymm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3div8_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2} +; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32) + +define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x 
i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32) + +define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %ymm0 {%k1} +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4) + %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1} +; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32) + +define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32) + +define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2} +; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,0), %xmm0 {%k1} +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4) + %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 0) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32) 
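+; A note on the convention these gather3 tests follow, with a sketch of a
+; minimal call (value names %r, %src, %base, %idx, %mask are illustrative,
+; not taken from the tests): the "div" variants take qword index vectors and
+; lower to the vgatherq* forms, while the "siv" variants take dword index
+; vectors and lower to the vgatherd* forms. The operand order is apparently
+; (source/passthrough vector, base pointer, index vector, mask, scale), as
+; the vmovaps-then-gather sequences above suggest. For the declaration just
+; above, a call such as
+;   %r = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %src,
+;            i8* %base, <8 x i32> %idx, i8 %mask, i32 4)
+; would gather eight floats from %base plus 4 times each dword index, under
+; %mask, with masked-off lanes keeping %src.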
+ +define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%ymm1,0), %ymm0 {%k1} +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4) + %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 0) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm2 +; CHECK-NEXT: kmovw %k1, %k2 +; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2} +; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,0), %ymm0 {%k1} +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 0) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32) + +define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,0) {%k2} +; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32) + +define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32) + +define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4) + ret 
void +} + +declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32) + +define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32) + +define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32) + +define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,0) {%k2} +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32) + +define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32) + +define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 0) + call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32) + +define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) { +; CHECK-LABEL: 
test_int_x86_avx512_scattersiv2_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,0) {%k2} +; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32) + +define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,0) {%k2} +; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32) + +define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32) + +define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: kxnorw %k2, %k2, %k2 +; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,0) {%k2} +; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32) + +define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32) + +define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) 
{%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32) + +define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4) + ret void +} + +declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32) + +define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovb %esi, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,0) {%k1} +; CHECK-NEXT: kxnorw %k1, %k1, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 0) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4) + ret void +} + diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index a06cadaa3f5ab..b9f490b8a39af 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -489,19 +489,31 @@ declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double> } declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32) - define <16 x i32> @test_pabsd(<16 x i32> %a) { - ;CHECK: vpabsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0] - %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a, <16 x i32>zeroinitializer, i16 -1) - ret < 16 x i32> %res - } declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16) - define <8 x i64> @test_pabsq(<8 x i64> %a) { - ;CHECK: vpabsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0] - %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a, <8 x i64>zeroinitializer, i8 -1) - ret <8 x i64> %res - } - declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8) +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsd{{.*}}{%k1} +define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { + %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsq{{.*}}{%k1} +define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { + %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x 
i64> %x0, <8 x i64> %x1, i8 -1) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK: vptestmq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1] @@ -3013,3 +3025,146 @@ define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> % %res2 = add <8 x i64> %res, %res1 ret <8 x i64> %res2 } + +declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2d {{.*}}{%k1} +define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2pd {{.*}}{%k1} +define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { + %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2ps {{.*}}{%k1} +define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2q {{.*}}{%k1} +define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { + %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2d {{.*}}{%k1} {z} +define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { + %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> 
@llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2pd {{.*}}{%k1} {z} +define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) { + %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) + %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2ps {{.*}}{%k1} {z} +define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { + %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) + %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + + +declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2q {{.*}}{%k1} {z} +define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2d {{.*}}{%k1} +; CHECK-NOT: {z} +define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefpd{{.*}}{%k1} +define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) { + %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 3) + %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + 
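+; The trailing i32 operand of these scalef intrinsics is the rounding-mode
+; selector used throughout this file: 0 = {rn-sae}, 1 = {rd-sae},
+; 2 = {ru-sae}, 3 = {rz-sae}, 4 = current rounding, matching what the
+; vfnmsub rounding tests earlier in the file check for. A sketch of a
+; masked call pinned to round-to-nearest, with illustrative value names:
+;   %r = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(
+;            <8 x double> %a, <8 x double> %b, <8 x double> %c,
+;            i8 %mask, i32 0)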
+declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefps{{.*}}{%k1} +define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { + %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 2) + %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll deleted file mode 100644 index 7e9eda58737d1..0000000000000 --- a/test/CodeGen/X86/avx512-shuffle.ll +++ /dev/null @@ -1,392 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX - -; CHECK-LABEL: test1: -; CHECK: vpermps -; CHECK: ret -define <16 x float> @test1(<16 x float> %a) nounwind { - %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1> - ret <16 x float> %c -} - -; CHECK-LABEL: test2: -; CHECK: vpermd -; CHECK: ret -define <16 x i32> @test2(<16 x i32> %a) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1> - ret <16 x i32> %c -} - -; CHECK-LABEL: test3: -; CHECK: vpermq -; CHECK: ret -define <8 x i64> @test3(<8 x i64> %a) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1> - ret <8 x i64> %c -} - -; CHECK-LABEL: test4: -; CHECK: vpermpd -; CHECK: ret -define <8 x double> @test4(<8 x double> %a) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x double> %c -} - -; CHECK-LABEL: test5: -; CHECK: vpermt2pd -; CHECK: ret -define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> - ret <8 x double> %c -} - -; CHECK-LABEL: test6: -; CHECK: vpermq $30 -; CHECK: ret -define <8 x i64> @test6(<8 x i64> %a) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4> - ret <8 x i64> %c -} - -; CHECK-LABEL: test7: -; CHECK: vpermt2q -; CHECK: ret -define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind { - %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> - ret <8 x i64> %c -} - -; CHECK-LABEL: test8: -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x i32> %c -} - -; CHECK-LABEL: test9: -; CHECK: vpermt2ps -; CHECK: ret -define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind { - %c 
= shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x float> %c -} - -; CHECK-LABEL: test10: -; CHECK: vpermt2ps ( -; CHECK: ret -define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind { - %c = load <16 x float>, <16 x float>* %b - %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x float> %d -} - -; CHECK-LABEL: test11: -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind { - %c = load <16 x i32>, <16 x i32>* %b - %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> - ret <16 x i32> %d -} - -; CHECK-LABEL: test13 -; CHECK: vpermilps $177, %zmm -; CHECK: ret -define <16 x float> @test13(<16 x float> %a) { - %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> - ret <16 x float> %b -} - -; CHECK-LABEL: test14 -; CHECK: vpermilpd $203, %zmm -; CHECK: ret -define <8 x double> @test14(<8 x double> %a) { - %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7> - ret <8 x double> %b -} - -; CHECK-LABEL: test15 -; CHECK: vpshufd $177, %zmm -; CHECK: ret -define <16 x i32> @test15(<16 x i32> %a) { -; mask 1-0-3-2 = 10110001 = 0xb1 = 177 - %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> - ret <16 x i32> %b -} -; CHECK-LABEL: test16 -; CHECK: valignq $3, %zmm0, %zmm1 -; CHECK: ret -define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> - ret <8 x double> %c -} - -; CHECK-LABEL: test17 -; CHECK: vshufpd $19, %zmm1, %zmm0 -; CHECK: ret -define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef> - ret <8 x double> %c -} - -; CHECK-LABEL: test18 -; CHECK: vpunpckhdq %zmm -; CHECK: ret -define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) { - %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> - ret <16 x i32> %b -} - -; CHECK-LABEL: test19 -; CHECK: vpunpckldq %zmm -; CHECK: ret -define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) { - %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> - ret <16 x i32> %b -} - -; CHECK-LABEL: test20 -; CHECK: vpunpckhqdq %zmm -; CHECK: ret -define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) { - %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> - ret <8 x i64> %b -} - -; CHECK-LABEL: test21 -; CHECK: vbroadcastsd %xmm0, %zmm -; CHECK: ret -define 
<8 x double> @test21(<8 x double> %a, <8 x double> %b) { - %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> - ret <8 x double> %shuffle -} - -; CHECK-LABEL: test22 -; CHECK: vpbroadcastq %xmm0, %zmm -; CHECK: ret -define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) { - %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> - ret <8 x i64> %shuffle -} - -; CHECK-LABEL: @test23 -; CHECK: vshufps -; CHECK: vshufps -; CHECK: ret -define <16 x i32> @test23(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test24 -; CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 25, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test25 -; CHECK: vshufps $52 -; CHECK: ret -define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind { -; mask - 0-1-3-0 00110100 = 0x34 = 52 - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 16, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test26 -; CHECK: vmovshdup -; CHECK: ret -define <16 x i32> @test26(<16 x i32> %a) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test27 -; CHECK: ret -define <16 x i32> @test27(<4 x i32>%a) { - %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i32> %res -} - -; CHECK-LABEL: test28 -; CHECK: vpshufhw $177, %ymm -; CHECK: ret -define <16 x i16> @test28(<16 x i16> %a) { - %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32><i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14> - ret <16 x i16> %b -} - -; CHECK-LABEL: test29 -; CHECK: vunpcklps %zmm -; CHECK: ret -define <16 x float> @test29(<16 x float> %a, <16 x float> %c) { - %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> - ret <16 x float> %b -} - -; CHECK-LABEL: @test30 -; CHECK: vshufps $144, %zmm -; CHECK: ret -define <16 x float> @test30(<16 x float> %a, <16 x float> %c) { - %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30> - ret <16 x float> %b -} - -; CHECK-LABEL: test31 -; CHECK: valignd $3, %zmm0, %zmm1 -; CHECK: ret -define <16 x i32> @test31(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 
5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> - ret <16 x i32> %c -} - -; CHECK-LABEL: test32 -; CHECK: vshufpd $99, %zmm0, %zmm1 -; CHECK: ret -define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind { - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 1, i32 10, i32 2, i32 undef, i32 5, i32 15, i32 undef> - ret <8 x double> %c -} - -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind { -; CHECK-LABEL: test_vshuff64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5> - ret <8 x double> %res -} - -define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind { -; CHECK-LABEL: test_vshuff64x2_512_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1 -; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 -; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: retq - %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5> - %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer - ret <8 x double> %res -} - -define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind { -; CHECK-LABEL: test_vshufi64x2_512_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1 -; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 -; CHECK-NEXT: vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1} -; CHECK-NEXT: retq - %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 4, i32 5> - %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x - ret <8 x i64> %res -} - -define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind { -; CHECK-LABEL: test_vshuff64x2_512_mem: -; CHECK: ## BB#0: -; CHECK-NEXT: vshuff64x2 $40, %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: retq - %x1 = load <8 x double>,<8 x double> *%ptr,align 1 - %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 0, i32 1> - ret <8 x double> %res -} - -define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind { -; CHECK-LABEL: test_vshuff32x4_512_mem: -; CHECK: ## BB#0: -; CHECK-NEXT: vshuff64x2 $20, %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: retq - %x1 = load <16 x float>,<16 x float> *%ptr,align 1 - %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> - ret <16 x float> %res -} - -define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind { -; CHECK-LABEL: test_align_v16i32_rr: -; CHECK: ## BB#0: -; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> - ret <16 x i32> %c -} - -define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind { -; CHECK-LABEL: test_align_v16i32_rm: -; CHECK: ## 
BB#0: -; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0 -; CHECK-NEXT: retq - %a = load <16 x i32>, <16 x i32>* %a.ptr - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> - ret <16 x i32> %c -} - -define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind { -; CHECK-LABEL: test_align_v16i32_rm_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1 -; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 -; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq -; -; CHECK-SKX-LABEL: test_align_v16i32_rm_mask: -; CHECK-SKX: ## BB#0: -; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1 -; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1 -; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1} -; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-SKX-NEXT: retq - %a = load <16 x i32>, <16 x i32>* %a.ptr - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> - %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a - ret <16 x i32> %res -} - -define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind { -; CHECK-LABEL: test_align_v8f64_rr: -; CHECK: ## BB#0: -; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> - ret <8 x double> %c -} - -define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind { -; CHECK-LABEL: test_align_v18f64_rm: -; CHECK: ## BB#0: -; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 -; CHECK-NEXT: retq - %a = load <8 x double>, <8 x double>* %a.ptr - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> - ret <8 x double> %c -} - -define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind { -; CHECK-LABEL: test_align_v18f64_rm_mask: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1 -; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 -; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: retq -; -; CHECK-SKX-LABEL: test_align_v18f64_rm_mask: -; CHECK-SKX: ## BB#0: -; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1 -; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z} -; CHECK-SKX-NEXT: retq - %a = load <8 x double>, <8 x double>* %a.ptr - %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> - %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer - ret <8 x double> %res -} - diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 9ee0e09d1b7a2..9574c016ad509 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -893,6 +893,45 @@ define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16 ret <32 x i16> %res2 } +declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512 +; CHECK-NOT: call 
+; CHECK: kmov +; CHECK: vpermt2w %zmm{{.*}}{%k1} +define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2w %zmm{{.*}}{%k1} {z} +define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { + %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2w %zmm{{.*}}{%k1} +define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_512 @@ -918,3 +957,43 @@ define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %res2 = add <32 x i16> %res, %res1 ret <32 x i16> %res2 } + +declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpshufb %zmm{{.*}}{%k1} +define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { + %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + %res2 = add <64 x i8> %res, %res1 + ret <64 x i8> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsw{{.*}}{%k1} +define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { + %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_512 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsb{{.*}}{%k1} +define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, 
<64 x i8> %x1, i64 %x2) { + %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1) + %res2 = add <64 x i8> %res, %res1 + ret <64 x i8> %res2 +} + diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index cf8c32a48b6b0..0119d3945f4e8 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -612,248 +612,925 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone -declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone +declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfmadd256_ps ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2] - %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind ret <8 x float> %res } -declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfmadd128_ps ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } -declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) +declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) { ; CHECK-LABEL: test_mask_fmadd256_pd: ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] - %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) + %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) ret <4 x double> %res } -declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) +declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; CHECK-LABEL: test_mask_fmadd128_pd: ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) + %res = call <2 x double> 
@llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) ret <2 x double> %res } -declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone - -define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsub256_ps - ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2] - %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind - ret <8 x float> %res -} - -declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone - -define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsub128_ps - ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind - ret <4 x float> %res -} - -declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone - -define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsub256_pd - ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2] - %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind - ret <4 x double> %res -} - -declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone - -define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsub128_pd - ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind - ret <2 x double> %res -} - -declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone +define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, 
%xmm3 {%k1} +; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z} +; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, 
%ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x 
float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z} +; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + + +declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + + +declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 
-1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmadd256_ps ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2] - %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind ret <8 x float> %res } -declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmadd128_ps ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } -declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone +declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x 
double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmadd256_pd ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2] - %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } -declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmadd128_pd ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } -declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone +declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmsub256_ps ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2] - %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind ret <8 x float> %res } -declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmsub128_ps ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } -declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone +declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmsub256_pd ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2] - %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } -declare <2 x 
double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfnmsub128_pd ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } -declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x 
double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: 
vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 
+ ret <8 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) { ; CHECK-LABEL: test_mask_fmaddsub256_ps: ; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2] - %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) + %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) ret <8 x float> %res } -declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { ; CHECK-LABEL: test_mask_fmaddsub128_ps: ; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) + %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) ret <4 x float> %res } -declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone +declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfmaddsub256_pd ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2] - %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } -declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfmaddsub128_pd ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } -declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone - -define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsubadd256_ps - ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2] - %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind - ret <8 x float> %res -} - -declare <4 x float> 
@llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone - -define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsubadd128_ps - ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind - ret <4 x float> %res -} - -declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone - -define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsubadd256_pd - ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2] - %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind - ret <4 x double> %res -} -declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone - -define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsubadd128_pd - ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind - ret <2 x double> %res -} +define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL:
test_int_x86_avx512_maskz_vfmaddsub_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z} +; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, 
%k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} +; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z} +; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} +; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmaddsub231ps %ymm1, 
%ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z} +; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2=fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2=fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; 
CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2=fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2=fadd <8 x float> %res, %res1 + ret <8 x float> %res2 } -define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd - ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07] - %a2 = load <2 x double>, <2 x double>* %ptr_a2 - %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind - ret <2 x double> %res -} -declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone -define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) { - ; CHECK-LABEL: test_mask_vfmsubaddrm_pd - ; CHECK: vfmsubadd213pd (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07] - %a2 = load <8 x double>, <8 x double>* %ptr_a2, align 8 - %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind - ret <8 x double> %res -} define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfmadd128_ps_r ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { ; CHECK-LABEL: test_mask_vfmadd128_ps_rz ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2] - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind ret <4 x float> %res } @@ -861,7 +1538,7 @@ define <4 
x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] %a2 = load <4 x float>, <4 x float>* %ptr_a2 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } @@ -869,7 +1546,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind ret <4 x float> %res } @@ -877,7 +1554,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] %a2 = load <4 x float>, <4 x float>* %ptr_a2 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind ret <4 x float> %res } @@ -885,7 +1562,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind ret <4 x float> %res } @@ -897,7 +1574,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind ret <4 x float> %res } @@ -909,7 +1586,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind ret <4 x float> %res } @@ -921,7 +1598,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1 %vecinit2.i = 
insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind ret <4 x float> %res } @@ -933,21 +1610,21 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 - %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind ret <4 x float> %res } define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfmadd128_pd_r ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { ; CHECK-LABEL: test_mask_vfmadd128_pd_rz ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2] - %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind + %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind ret <2 x double> %res } @@ -955,7 +1632,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> % ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07] %a2 = load <2 x double>, <2 x double>* %ptr_a2 - %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind ret <2 x double> %res } @@ -963,21 +1640,21 @@ define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07] %a2 = load <2 x double>, <2 x double>* %ptr_a2 - %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind + %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind ret <2 x double> %res } define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_vfmadd256_pd_r ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] - %res = call <4 x double> 
@llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { ; CHECK-LABEL: test_mask_vfmadd256_pd_rz ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2] - %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind + %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind ret <4 x double> %res } @@ -985,7 +1662,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> % ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07] %a2 = load <4 x double>, <4 x double>* %ptr_a2 - %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind ret <4 x double> %res } @@ -993,7 +1670,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07] %a2 = load <4 x double>, <4 x double>* %ptr_a2 - %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind + %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind ret <4 x double> %res } define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { @@ -2877,6 +3554,85 @@ define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16 ret <16 x i16> %res2 } +declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2w %xmm{{.*}}{%k1} +; CHECK-NOT: {z} +define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { + %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2w %xmm{{.*}}{%k1} {z} +define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { + %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +; CHECK-LABEL: 
@test_int_x86_avx512_mask_vpermt2var_hi_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2w %ymm{{.*}}{%k1} +define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { + %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2w %ymm{{.*}}{%k1} {z} +define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { + %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2w %xmm{{.*}}{%k1} +define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { + %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2w %ymm{{.*}}{%k1} +define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { + %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_128 @@ -2928,3 +3684,82 @@ define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %res2 = add <16 x i16> %res, %res1 ret <16 x i16> %res2 } + +declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpshufb %xmm{{.*}}{%k1} +define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { + %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + %res2 = add <16 x i8> %res, %res1 + ret <16 x i8> %res2 +} + +declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +; CHECK-LABEL: 
@test_int_x86_avx512_mask_pshuf_b_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpshufb %ymm{{.*}}{%k1} +define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { + %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + %res2 = add <32 x i8> %res, %res1 + ret <32 x i8> %res2 +} + +declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsb{{.*}}{%k1} +define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) { + %res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) + %res2 = add <16 x i8> %res, %res1 + ret <16 x i8> %res2 +} + +declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsb{{.*}}{%k1} +define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) { + %res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1) + %res2 = add <32 x i8> %res, %res1 + ret <32 x i8> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsw{{.*}}{%k1} +define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) { + %res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsw{{.*}}{%k1} +define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) { + %res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index dfd4986b85c1e..fb7c93dc53b3a 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -2794,4 +2794,213 @@ define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> % %res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) %res2 = add <4 x i64> %res, %res1 ret <4 x i64> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2d %xmm{{.*}}{%k1} +; CHECK-NOT: {z} +define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x 
i32> %x2, i8 %x3) { + %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2d %xmm{{.*}}{%k1} {z} +define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { + %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2d %ymm{{.*}}{%k1} +; CHECK-NOT: {z} +define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { + %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermt2d {{.*}}{%k1} {z} +define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { + %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2pd %xmm{{.*}}{%k1} +define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { + %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2pd %ymm{{.*}}{%k1} +define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { + %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + 
%res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2ps %xmm{{.*}}{%k1} +define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { + %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpermi2ps %ymm{{.*}}{%k1} +define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { + %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsq{{.*}}{%k1} +define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { + %res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsq{{.*}}{%k1} +define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { + %res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsd{{.*}}{%k1} +define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { + %res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vpabsd{{.*}}{%k1} +define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { + %res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret 
<8 x i32> %res2 +} + + +declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefpd{{.*}}{%k1} +define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { + %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefpd{{.*}}{%k1} +define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { + %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_128 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefps{{.*}}{%k1} +define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { + %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_256 +; CHECK-NOT: call +; CHECK: kmov +; CHECK: vscalefps{{.*}}{%k1} +define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { + %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 }
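Every intrinsic regression test added in the two hunks above follows the same shape: the intrinsic is called once with the live mask argument and once with an all-ones mask (-1), and the two results are combined with add/fadd so that neither call is dead and a single FileCheck body can inspect the masked instruction (and, in the FMA tests, the unmasked form as well). As an editor's illustration only, not part of the commit, here is a minimal sketch of that pattern in LLVM IR, reusing one of the intrinsics declared above; the function name @sketch and the exact passthru semantics are assumptions for the example:

declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double> @sketch(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %m) {
  ; masked call: lanes cleared in %m keep the passthru operand's value
  %r0 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %m)
  ; all-ones mask: every lane is written, i.e. the unmasked behavior
  %r1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1)
  ; the fadd keeps both calls live, so both appear in the checked output
  %r = fadd <2 x double> %r0, %r1
  ret <2 x double> %r
}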
diff --git a/test/CodeGen/X86/coff-weak.ll b/test/CodeGen/X86/coff-weak.ll
new file mode 100644
index 0000000000000..369750147f292
--- /dev/null
+++ b/test/CodeGen/X86/coff-weak.ll
@@ -0,0 +1,9 @@
+; RUN: llc -function-sections -o - %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; CHECK: .section{{.*}}one_only
+define linkonce_odr void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/X86/commute-two-addr.ll b/test/CodeGen/X86/commute-two-addr.ll
index 656c385e2bc7d..5b01e2f4e90d5 100644
--- a/test/CodeGen/X86/commute-two-addr.ll
+++ b/test/CodeGen/X86/commute-two-addr.ll
@@ -39,7 +39,7 @@ define %0 @t3(i32 %lb, i8 zeroext %has_lb, i8 zeroext %lb_inclusive, i32 %ub, i8
 entry:
 ; DARWIN-LABEL: t3:
 ; DARWIN: shlq $32, %rcx
-; DARWIN-NEXT: orq %rcx, %rax
+; DARWIN-NEXT: leaq (%rax,%rcx), %rax
 ; DARWIN-NEXT: shll $8
 ; DARWIN-NOT: leaq
 %tmp21 = zext i32 %lb to i64
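The commute-two-addr.ll update just above deserves a remark: after shlq $32, %rcx the low 32 bits of %rcx are known to be zero, so OR-ing in a zero-extended 32-bit value produces the same result as adding it, and an add can be emitted as a three-operand leaq that does not tie the destination to one of its sources the way a two-address orq does. A hedged sketch of why the two are interchangeable, in LLVM IR (the function name @or_vs_add is illustrative only):

define i64 @or_vs_add(i32 %hi, i32 %lo) {
  %h = zext i32 %hi to i64
  %l = zext i32 %lo to i64
  %s = shl i64 %h, 32
  ; %s and %l share no set bits, so "or" here is equivalent to "add";
  ; the backend is then free to select leaq (%rax,%rcx), %rax for it.
  %r = or i64 %s, %l
  ret i64 %r
}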
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index 629a5572977f8..bb5e92f98c7d7 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -71,33 +71,33 @@ define weak_odr dllexport void @weak1() {
 @blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*)
 
 ; CHECK: .section .drectve
-; WIN32: " /EXPORT:Var1,DATA"
-; WIN32: " /EXPORT:Var2,DATA"
-; WIN32: " /EXPORT:Var3,DATA"
-; WIN32: " /EXPORT:WeakVar1,DATA"
-; WIN32: " /EXPORT:WeakVar2,DATA"
-; WIN32: " /EXPORT:f1"
-; WIN32: " /EXPORT:f2"
-; WIN32: " /EXPORT:lnk1"
-; WIN32: " /EXPORT:lnk2"
-; WIN32: " /EXPORT:weak1"
-; WIN32: " /EXPORT:alias"
-; WIN32: " /EXPORT:alias2"
-; WIN32: " /EXPORT:alias3"
-; WIN32: " /EXPORT:weak_alias"
-; WIN32: " /EXPORT:blob_alias"
-; MINGW: " -export:Var1,data"
-; MINGW: " -export:Var2,data"
-; MINGW: " -export:Var3,data"
-; MINGW: " -export:WeakVar1,data"
-; MINGW: " -export:WeakVar2,data"
-; MINGW: " -export:f1"
-; MINGW: " -export:f2"
-; MINGW: " -export:lnk1"
-; MINGW: " -export:lnk2"
-; MINGW: " -export:weak1"
-; MINGW: " -export:alias"
-; MINGW: " -export:alias2"
-; MINGW: " -export:alias3"
-; MINGW: " -export:weak_alias"
-; MINGW: " -export:blob_alias"
+; WIN32: /EXPORT:f1
+; WIN32-SAME: /EXPORT:f2
+; WIN32-SAME: /EXPORT:lnk1
+; WIN32-SAME: /EXPORT:lnk2
+; WIN32-SAME: /EXPORT:weak1
+; WIN32-SAME: /EXPORT:Var1,DATA
+; WIN32-SAME: /EXPORT:Var2,DATA
+; WIN32-SAME: /EXPORT:Var3,DATA
+; WIN32-SAME: /EXPORT:WeakVar1,DATA
+; WIN32-SAME: /EXPORT:WeakVar2,DATA
+; WIN32-SAME: /EXPORT:alias
+; WIN32-SAME: /EXPORT:alias2
+; WIN32-SAME: /EXPORT:alias3
+; WIN32-SAME: /EXPORT:weak_alias
+; WIN32-SAME: /EXPORT:blob_alias
+; MINGW: -export:f1
+; MINGW-SAME: -export:f2
+; MINGW-SAME: -export:lnk1
+; MINGW-SAME: -export:lnk2
+; MINGW-SAME: -export:weak1
+; MINGW-SAME: -export:Var1,data
+; MINGW-SAME: -export:Var2,data
+; MINGW-SAME: -export:Var3,data
+; MINGW-SAME: -export:WeakVar1,data
+; MINGW-SAME: -export:WeakVar2,data
+; MINGW-SAME: -export:alias
+; MINGW-SAME: -export:alias2
+; MINGW-SAME: -export:alias3
+; MINGW-SAME: -export:weak_alias
+; MINGW-SAME: -export:blob_alias"
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index 02a83ae7b191d..915567de5bf77 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -89,40 +89,41 @@ define weak_odr dllexport void @weak1() {
 @weak_alias = weak_odr dllexport alias void()* @f1
 
 ; CHECK: .section .drectve
-; CHECK-CL: " /EXPORT:_Var1,DATA"
-; CHECK-CL: " /EXPORT:_Var2,DATA"
-; CHECK-CL: " /EXPORT:_Var3,DATA"
-; CHECK-CL: " /EXPORT:_WeakVar1,DATA"
-; CHECK-CL: " /EXPORT:_WeakVar2,DATA"
-; CHECK-CL: " /EXPORT:_f1"
-; CHECK-CL: " /EXPORT:_f2"
 ; CHECK-CL-NOT: not_exported
-; CHECK-CL: " /EXPORT:_stdfun@0"
-; CHECK-CL: " /EXPORT:@fastfun@0"
-; CHECK-CL: " /EXPORT:_thisfun"
-; CHECK-CL: " /EXPORT:_lnk1"
-; CHECK-CL: " /EXPORT:_lnk2"
-; CHECK-CL: " /EXPORT:_weak1"
-; CHECK-CL: " /EXPORT:_alias"
-; CHECK-CL: " /EXPORT:_alias2"
-; CHECK-CL: " /EXPORT:_alias3"
-; CHECK-CL: " /EXPORT:_weak_alias"
-; CHECK-GCC: " -export:Var1,data"
-; CHECK-GCC: " -export:Var2,data"
-; CHECK-GCC: " -export:Var3,data"
-; CHECK-GCC: " -export:WeakVar1,data"
-; CHECK-GCC: " -export:WeakVar2,data"
-; CHECK-GCC: " -export:f1"
-; CHECK-GCC: " -export:f2"
+; CHECK-CL: /EXPORT:_f1
+; CHECK-CL-SAME: /EXPORT:_f2
+; CHECK-CL-SAME: /EXPORT:_stdfun@0
+; CHECK-CL-SAME: /EXPORT:@fastfun@0
+; CHECK-CL-SAME: /EXPORT:_thisfun
+; CHECK-CL-SAME: /EXPORT:_lnk1
+; CHECK-CL-SAME: /EXPORT:_lnk2
+; CHECK-CL-SAME: /EXPORT:_weak1
+; CHECK-CL-SAME: /EXPORT:_Var1,DATA
+; CHECK-CL-SAME: /EXPORT:_Var2,DATA
+; CHECK-CL-SAME: /EXPORT:_Var3,DATA
+; CHECK-CL-SAME: /EXPORT:_WeakVar1,DATA
+; CHECK-CL-SAME: /EXPORT:_WeakVar2,DATA
+; CHECK-CL-SAME: /EXPORT:_alias
+; CHECK-CL-SAME: /EXPORT:_alias2
+; CHECK-CL-SAME: /EXPORT:_alias3
+; CHECK-CL-SAME: /EXPORT:_weak_alias"
 ; CHECK-CL-NOT: not_exported
-; CHECK-GCC: " -export:stdfun@0"
-; CHECK-GCC: " -export:@fastfun@0"
-; CHECK-GCC: " -export:thisfun"
-; CHECK-GCC: " -export:lnk1"
-; CHECK-GCC: " -export:lnk2"
-; CHECK-GCC: " -export:weak1"
-; CHECK-GCC: " -export:alias"
-; CHECK-GCC: " -export:alias2"
-; CHECK-GCC: " -export:alias3"
-; CHECK-GCC: " -export:weak_alias"
-
+; CHECK-GCC-NOT: not_exported
+; CHECK-GCC: -export:f1
+; CHECK-GCC-SAME: -export:f2
+; CHECK-GCC-SAME: -export:stdfun@0
+; CHECK-GCC-SAME: -export:@fastfun@0
+; CHECK-GCC-SAME: -export:thisfun
+; CHECK-GCC-SAME: -export:lnk1
+; CHECK-GCC-SAME: -export:lnk2
+; CHECK-GCC-SAME: -export:weak1
+; CHECK-GCC-SAME: -export:Var1,data
+; CHECK-GCC-SAME: -export:Var2,data
+; CHECK-GCC-SAME: -export:Var3,data
+; CHECK-GCC-SAME: -export:WeakVar1,data
+; CHECK-GCC-SAME: -export:WeakVar2,data
+; CHECK-GCC-SAME: -export:alias
+; CHECK-GCC-SAME: -export:alias2
+; CHECK-GCC-SAME: -export:alias3
+; CHECK-GCC-SAME: -export:weak_alias"
+; CHECK-GCC-NOT: not_exported
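Both dllexport test updates above lean on FileCheck's -SAME suffix: a CHECK-SAME directive must match on the same line as the previous match, which fits the .drectve section, where all of the /EXPORT (or -export) options are emitted as a single string on one line. For example, a pair such as

; CHECK: /EXPORT:_f1
; CHECK-SAME: /EXPORT:_f2

only matches when both strings occur on one line and in that order, whereas the old one-directive-per-line checks would also have accepted exports spread over several lines; the trailing quote on the last -SAME line pins the end of the emitted string.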
diff --git a/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll b/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
new file mode 100644
index 0000000000000..f7d0cdf3c65a1
--- /dev/null
+++ b/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
@@ -0,0 +1,204 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s
+
+; CHECK-LABEL: fmaddsubpd_loop:
+; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddpd_loop:
+; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddpd_loop:
+; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubpd_loop:
+; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+
+; CHECK-LABEL: fmaddsubps_loop:
+; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddps_loop:
+; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddps_loop:
+; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubps_loop:
+; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
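A note on the 213-to-231 selection these loop tests check for: the digits in an FMA3 mnemonic name which operands are multiplied and which is added. In AT&T syntax, vfmadd213pd %ymm2, %ymm1, %ymm0 computes ymm0 = (ymm1 * ymm0) + ymm2, while vfmadd231pd %ymm2, %ymm1, %ymm0 computes ymm0 = (ymm1 * ymm2) + ymm0. In every loop above the accumulator is the addend (%c.addr.0 flows through the phi), so choosing the 231 form keeps the loop-carried value in the destination register across iterations and avoids a register copy each time around; that is what the vfm*231* CHECK lines assert.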
diff --git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll
new file mode 100644
index 0000000000000..881436386bac8
--- /dev/null
+++ b/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -0,0 +1,493 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+
+; VFMADD
+define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFMSUB
+define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFNMADD
+define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFNMSUB
+define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  ret <4 x double> %res
+}
+declare <4 x double> 
@llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMADDSUB +define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-FMA4-NEXT: retq + %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +; VFMSUBADD +define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>) + +define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq 
+; +; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>) + +define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-FMA4-NEXT: retq + %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) + +define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) + +attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/fma-intrinsics-x86_64.ll b/test/CodeGen/X86/fma-intrinsics-x86_64.ll deleted file mode 100644 index aadd7311bb89e..0000000000000 --- a/test/CodeGen/X86/fma-intrinsics-x86_64.ll +++ /dev/null @@ -1,278 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK - -; VFMADD -define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfmaddss - ; CHECK-FMA: vfmadd213ss - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfmaddsd - ; CHECK-FMA: vfmadd213sd - %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfmaddps - ; CHECK-FMA: 
vfmadd213ps - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfmaddpd - ; CHECK-FMA: vfmadd213pd - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK-FMA4: vfmaddps - ; CHECK-FMA: vfmadd213ps - ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) - ret < 8 x float > %res -} -declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone - -define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK-FMA4: vfmaddpd - ; CHECK-FMA: vfmadd213pd - ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) - ret < 4 x double > %res -} -declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone - -; VFMSUB -define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfmsubss - ; CHECK-FMA: vfmsub213ss - %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfmsubsd - ; CHECK-FMA: vfmsub213sd - %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfmsubps - ; CHECK-FMA: vfmsub213ps - %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfmsubpd - ; CHECK-FMA: vfmsub213pd - %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK-FMA4: vfmsubps - ; CHECK-FMA: vfmsub213ps - ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) - ret < 8 x float > %res -} -declare < 8 x 
float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone - -define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK-FMA4: vfmsubpd - ; CHECK-FMA: vfmsub213pd - ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) - ret < 4 x double > %res -} -declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone - -; VFNMADD -define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfnmaddss - ; CHECK-FMA: vfnmadd213ss - %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfnmaddsd - ; CHECK-FMA: vfnmadd213sd - %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfnmaddps - ; CHECK-FMA: vfnmadd213ps - %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfnmaddpd - ; CHECK-FMA: vfnmadd213pd - %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK-FMA4: vfnmaddps - ; CHECK-FMA: vfnmadd213ps - ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) - ret < 8 x float > %res -} -declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone - -define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK-FMA4: vfnmaddpd - ; CHECK-FMA: vfnmadd213pd - ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) - ret < 4 x double > %res -} -declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone - -; VFNMSUB -define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfnmsubss - ; CHECK-FMA: vfnmsub213ss - %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x 
double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfnmsubsd - ; CHECK-FMA: vfnmsub213sd - %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfnmsubps - ; CHECK-FMA: vfnmsub213ps - %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfnmsubpd - ; CHECK-FMA: vfnmsub213pd - %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK-FMA4: vfnmsubps - ; CHECK-FMA: vfnmsub213ps - ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) - ret < 8 x float > %res -} -declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone - -define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK-FMA4: vfnmsubpd - ; CHECK-FMA: vfnmsub213pd - ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) - ret < 4 x double > %res -} -declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone - -; VFMADDSUB -define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfmaddsubps - ; CHECK-FMA: vfmaddsub213ps - %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfmaddsubpd - ; CHECK-FMA: vfmaddsub213pd - %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK-FMA4: vfmaddsubps - ; CHECK-FMA: vfmaddsub213ps - ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) - ret < 8 x float > %res -} -declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone - -define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x 
double > %a2) { - ; CHECK-FMA4: vfmaddsubpd - ; CHECK-FMA: vfmaddsub213pd - ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) - ret < 4 x double > %res -} -declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone - -; VFMSUBADD -define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK-FMA4: vfmsubaddps - ; CHECK-FMA: vfmsubadd213ps - %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) - ret < 4 x float > %res -} -declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone - -define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK-FMA4: vfmsubaddpd - ; CHECK-FMA: vfmsubadd213pd - %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) - ret < 2 x double > %res -} -declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone - -define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK-FMA4: vfmsubaddps - ; CHECK-FMA: vfmsubadd213ps - ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) - ret < 8 x float > %res -} -declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone - -define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK-FMA4: vfmsubaddpd - ; CHECK-FMA: vfmsubadd213pd - ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) - ret < 4 x double > %res -} -declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll index 9715bc7b328b7..34acdfe830f0e 100644 --- a/test/CodeGen/X86/fma-phi-213-to-231.ll +++ b/test/CodeGen/X86/fma-phi-213-to-231.ll @@ -1,246 +1,37 @@ -; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - -; CHECK-LABEL: fmaddsubpd_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: -; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond - -for.end: - ret <4 x double> %c.addr.0 -} - -; CHECK-LABEL: fmsubaddpd_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: 
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond - -for.end: - ret <4 x double> %c.addr.0 -} - -; CHECK-LABEL: fmaddpd_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: -; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond - -for.end: - ret <4 x double> %c.addr.0 -} - -; CHECK-LABEL: fmsubpd_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: -; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond - -for.end: - ret <4 x double> %c.addr.0 -} - -declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) -declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) -declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) -declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) - - -; CHECK-LABEL: fmaddsubps_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: -; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label 
%for.cond - -for.end: - ret <8 x float> %c.addr.0 -} - -; CHECK-LABEL: fmsubaddps_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: -; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond - -for.end: - ret <8 x float> %c.addr.0 -} - -; CHECK-LABEL: fmaddps_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: -; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond - -for.end: - ret <8 x float> %c.addr.0 -} - -; CHECK-LABEL: fmsubps_loop -; CHECK: [[BODYLBL:LBB.+]]: -; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -; CHECK: [[INCLBL:LBB.+]]: -; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] -; CHECK: cmpl {{%.+}}, [[INDREG]] -; CHECK: jl [[BODYLBL]] -define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { -entry: - br label %for.cond - -for.cond: - %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] - %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp = icmp slt i32 %i.0, %iter - br i1 %cmp, label %for.body, label %for.end - -for.body: - br label %for.inc - -for.inc: - %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) - %inc = add nsw i32 %i.0, 1 - br label %for.cond - -for.end: - ret <8 x float> %c.addr.0 -} - -declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) -declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) -declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) -declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s + +; Test FMA3 variant selection + +; CHECK-LABEL: fma3_select231ssX: +; CHECK: vfmadd231ss %xmm +define float @fma3_select231ssX(float %x, float %y) { +entry: + br label %while.body +while.body: + %acc.01 = phi float [ 0.000000e+00, %entry ], [ %acc, %while.body ] + %acc = call float @llvm.fma.f32(float %x, float %y, float %acc.01) + %b = fcmp ueq float %acc, 0.0 + br i1 %b, label %while.body, 
label %while.end +while.end: + ret float %acc +} + +; CHECK-LABEL: fma3_select231pdY: +; CHECK: vfmadd231pd %ymm +define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) { +entry: + br label %while.body +while.body: + %acc.04 = phi <4 x double> [ zeroinitializer, %entry ], [ %add, %while.body ] + %add = call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %acc.04) + %vecext = extractelement <4 x double> %add, i32 0 + %cmp = fcmp oeq double %vecext, 0.000000e+00 + br i1 %cmp, label %while.body, label %while.end +while.end: + ret <4 x double> %add +} + +declare float @llvm.fma.f32(float, float, float) +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll index 2eb152b078eff..b91479cda8715 100644 --- a/test/CodeGen/X86/fma.ll +++ b/test/CodeGen/X86/fma.ll @@ -1,80 +1,47 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST -; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL -; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST -; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL - -; CHECK: test_f32 +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+avx512f,-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL + +; CHECK-LABEL: test_f32: ; CHECK-FMA-INST: vfmadd213ss ; CHECK-FMA-CALL: fmaf - -define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp { +define float @test_f32(float %a, float %b, float %c) #0 { entry: - %call = tail call float @llvm.fma.f32(float %a, float %b, float %c) nounwind readnone + %call = call float @llvm.fma.f32(float %a, float %b, float %c) ret float %call } -; CHECK: test_f64 +; CHECK-LABEL: test_f64: ; CHECK-FMA-INST: vfmadd213sd ; CHECK-FMA-CALL: fma - -define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp { +define double @test_f64(double %a, double %b, double %c) #0 { entry: - %call = tail call double @llvm.fma.f64(double %a, double %b, double %c) nounwind readnone + %call = call double @llvm.fma.f64(double %a, double %b, double %c) ret double %call } -; CHECK: test_f80 +; CHECK-LABEL: test_f80: ; CHECK: fmal - -define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp { +define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 { entry: - %call 
= tail call x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone + %call = call x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) ret x86_fp80 %call } -; CHECK: test_f32_cst -; CHECK-NOT: fma -define float @test_f32_cst() nounwind readnone ssp { +; CHECK-LABEL: test_f32_cst: +; CHECK-NOT: vfmadd +define float @test_f32_cst() #0 { entry: - %call = tail call float @llvm.fma.f32(float 3.0, float 3.0, float 3.0) nounwind readnone + %call = call float @llvm.fma.f32(float 3.0, float 3.0, float 3.0) ret float %call } -; Test FMA3 variant selection -; CHECK-FMA-INST: fma3_select231ssX: -; CHECK-FMA-INST: vfmadd231ss %xmm -define float @fma3_select231ssX(float %x, float %y) #0 { -entry: - br label %while.body -while.body: ; preds = %while.body, %while.body - %acc.01 = phi float [ 0.000000e+00, %entry ], [ %acc, %while.body ] - %acc = tail call float @llvm.fma.f32(float %x, float %y, float %acc.01) nounwind readnone - %b = fcmp ueq float %acc, 0.0 - br i1 %b, label %while.body, label %while.end -while.end: ; preds = %while.body, %entry - ret float %acc -} - -; Test FMA3 variant selection -; CHECK-FMA-INST: fma3_select231pdY: -; CHECK-FMA-INST: vfmadd231pd %ymm -define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 { -entry: - br label %while.body -while.body: ; preds = %entry, %while.body - %acc.04 = phi <4 x double> [ zeroinitializer, %entry ], [ %add, %while.body ] - %add = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %acc.04) - %vecext = extractelement <4 x double> %add, i32 0 - %cmp = fcmp oeq double %vecext, 0.000000e+00 - br i1 %cmp, label %while.body, label %while.end - -while.end: ; preds = %while.body - ret <4 x double> %add -} +declare float @llvm.fma.f32(float, float, float) +declare double @llvm.fma.f64(double, double, double) +declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) -declare float @llvm.fma.f32(float, float, float) nounwind readnone -declare double @llvm.fma.f64(double, double, double) nounwind readnone -declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) nounwind readnone -declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll deleted file mode 100755 index fa9c252f30b46..0000000000000 --- a/test/CodeGen/X86/fma3-intrinsics.ll +++ /dev/null @@ -1,150 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s -; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s - -define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fmadd213ss (%r8), [[XMM1]], [[XMM0]] - %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fmadd213ps - %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind 
readnone - -define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { - ; CHECK: fmadd213ps {{.*\(%r.*}}, %ymm - %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind - ret <8 x float> %res -} -declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone - -define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fnmadd213ss (%r8), [[XMM1]], [[XMM0]] - %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fnmadd213ps - %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { - ; CHECK: fnmadd213ps {{.*\(%r.*}}, %ymm - %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind - ret <8 x float> %res -} -declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone - - -define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fmsub213ss (%r8), [[XMM1]], [[XMM0]] - %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fmsub213ps - %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fnmsub213ss (%r8), [[XMM1]], [[XMM0]] - %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fnmsub213ps - %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -;;;; - -define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fmadd213sd (%r8), [[XMM1]], 
[[XMM0]] - %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone - -define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fmadd213pd - %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone - -define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fnmadd213sd (%r8), [[XMM1]], [[XMM0]] - %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone - -define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fnmadd213pd - %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone - - - -define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fmsub213sd (%r8), [[XMM1]], [[XMM0]] - %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone - -define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fmsub213pd - %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone - -define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]] - ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]] - ; CHECK: fnmsub213sd (%r8), [[XMM1]], [[XMM0]] - %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone - -define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fnmsub213pd - %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone diff --git a/test/CodeGen/X86/fold-load-binops.ll b/test/CodeGen/X86/fold-load-binops.ll new file mode 100644 index 0000000000000..6d501c74fe57b --- /dev/null +++ b/test/CodeGen/X86/fold-load-binops.ll @@ -0,0 +1,142 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE +; RUN: 
llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+
+; Verify that we're folding the load into the math instruction.
+; This pattern is generated out of the simplest intrinsics usage:
+; _mm_add_ss(a, _mm_load_ss(b));
+
+define <4 x float> @addss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: addss:
+; SSE: # BB#0:
+; SSE-NEXT: addss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: addss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fadd float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @addsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: addsd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: addsd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fadd double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @subss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: subss:
+; SSE: # BB#0:
+; SSE-NEXT: subss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: subss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fsub float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @subsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: subsd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: subsd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fsub double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @mulss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: mulss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mulss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fmul float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: mulsd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mulsd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fmul double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @divss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: divss:
+; SSE: # BB#0:
+; SSE-NEXT: divss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: divss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fdiv float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @divsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: divsd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: divsd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fdiv double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
diff --git a/test/CodeGen/X86/fold-vector-sext-crash2.ll b/test/CodeGen/X86/fold-vector-sext-crash2.ll
new file mode 100644
index 0000000000000..44c836195abc2
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-sext-crash2.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+
+; DAGCombiner crashes during sext folding
+
+define <2 x i256> @test_sext1() {
+ %Se = sext <2 x i8> <i8 -100, i8 -99> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_sext1
+ ; X64: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-99
+
+ ; X32-LABEL: test_sext1
+ ; X32: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-99
+}
+
+define <2 x i256> @test_sext2() {
+ %Se = sext <2 x i128> <i128 -2000, i128 -1999> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_sext2
+ ; X64: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-1999
+
+ ; X32-LABEL: test_sext2
+ ; X32: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1999
+}
+
+define <2 x i256> @test_zext1() {
+ %Se = zext <2 x i8> <i8 -1, i8 -2> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_zext1
+ ; X64: movq $0
+ ; X64-NEXT: movq $0
+ ; X64-NEXT: movq $0
+ ; X64-NEXT: movq $254
+
+ ; X32-LABEL: test_zext1
+ ; X32: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $254
+}
+
+define <2 x i256> @test_zext2() {
+ %Se = zext <2 x i128> <i128 -1, i128 -2> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_zext2
+ ; X64: movq $0
+ ; X64-NEXT: movq $0
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-2
+
+ ; X32-LABEL: test_zext2
+ ; X32: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-2
+}
diff --git a/test/CodeGen/X86/fold-vector-shl-crash.ll b/test/CodeGen/X86/fold-vector-shl-crash.ll
new file mode 100644
index 0000000000000..9f81e44074f1d
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-shl-crash.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+;CHECK-LABEL: test
+define <2 x i256> @test() {
+ %S = shufflevector <2 x i256> zeroinitializer, <2 x i256> <i256 -1, i256 -1>, <2 x i32> <i32 0, i32 2>
+ %B = shl <2 x i256> %S, <i256 -1, i256 -1> ; DAG Combiner crashes here
+ ret <2 x i256> %B
+}
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
index 4f503af716a80..27af5738ca3e8 100644
--- a/test/CodeGen/X86/fp-fast.ll
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -114,81 +114,3 @@ define float @test11(float %a) {
 ret float %t2
 }
-; Verify that the first two adds are independent regardless of how the inputs are
-; commuted. The destination registers are used as source registers for the third add.
-
-define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %t1, %x3
- ret float %t2
-}
-
-define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %x2, %t0
- %t2 = fadd float %t1, %x3
- ret float %t2
-}
-
-define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %x3, %t1
- ret float %t2
-}
-
-define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %x2, %t0
- %t2 = fadd float %x3, %t1
- ret float %t2
-}
-
-; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
-; produced because that would cost more compile time.
-
-define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm7, %xmm6, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %t1, %x3
- %t3 = fadd float %t2, %x4
- %t4 = fadd float %t3, %x5
- %t5 = fadd float %t4, %x6
- %t6 = fadd float %t5, %x7
- ret float %t6
-}
diff --git a/test/CodeGen/X86/implicit-null-check-negative.ll b/test/CodeGen/X86/implicit-null-check-negative.ll
index e0210d9315f14..8fbed9f7bee85
--- a/test/CodeGen/X86/implicit-null-check-negative.ll
+++ b/test/CodeGen/X86/implicit-null-check-negative.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -O3 -debug-only=faultmaps -enable-implicit-null-checks < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx -O3 -debug-only=faultmaps -enable-implicit-null-checks < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ; List cases where we should *not* be emitting implicit null checks.
@@ -10,7 +10,7 @@ define i32 @imp_null_check_load(i32* %x, i32* %y) {
 %c = icmp eq i32* %x, null
 ; It isn't legal to move the load from %x from "not_null" to here --
 ; the store to %y could be aliasing it.
- br i1 %c, label %is_null, label %not_null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 @@ -24,7 +24,7 @@ define i32 @imp_null_check_load(i32* %x, i32* %y) { define i32 @imp_null_check_gep_load(i32* %x) { entry: %c = icmp eq i32* %x, null - br i1 %c, label %is_null, label %not_null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 @@ -38,8 +38,7 @@ define i32 @imp_null_check_gep_load(i32* %x) { } define i32 @imp_null_check_load_no_md(i32* %x) { -; Everything is okay except that the !never.executed metadata is -; missing. +; This is fine, except it is missing the !make.implicit metadata. entry: %c = icmp eq i32* %x, null br i1 %c, label %is_null, label %not_null @@ -51,3 +50,5 @@ define i32 @imp_null_check_load_no_md(i32* %x) { %t = load i32, i32* %x ret i32 %t } + +!0 = !{} diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll index f4c539800fbbf..1d1b36bbd5d06 100644 --- a/test/CodeGen/X86/implicit-null-check.ll +++ b/test/CodeGen/X86/implicit-null-check.ll @@ -1,5 +1,15 @@ ; RUN: llc -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-implicit-null-checks \ +; RUN: | llvm-mc -triple x86_64-apple-macosx -filetype=obj -o - \ +; RUN: | llvm-objdump -triple x86_64-apple-macosx -fault-map-section - \ +; RUN: | FileCheck %s -check-prefix OBJDUMP + +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -enable-implicit-null-checks \ +; RUN: | llvm-mc -triple x86_64-unknown-linux-gnu -filetype=obj -o - \ +; RUN: | llvm-objdump -triple x86_64-unknown-linux-gnu -fault-map-section - \ +; RUN: | FileCheck %s -check-prefix OBJDUMP + define i32 @imp_null_check_load(i32* %x) { ; CHECK-LABEL: _imp_null_check_load: ; CHECK: Ltmp1: @@ -11,7 +21,7 @@ define i32 @imp_null_check_load(i32* %x) { entry: %c = icmp eq i32* %x, null - br i1 %c, label %is_null, label %not_null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 @@ -32,7 +42,7 @@ define i32 @imp_null_check_gep_load(i32* %x) { entry: %c = icmp eq i32* %x, null - br i1 %c, label %is_null, label %not_null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 @@ -55,7 +65,7 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) { entry: %c = icmp eq i32* %x, null - br i1 %c, label %is_null, label %not_null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 @@ -66,6 +76,8 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) { ret i32 %p1 } +!0 = !{} + ; CHECK-LABEL: __LLVM_FaultMaps: ; Version: @@ -116,3 +128,13 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) { ; CHECK-NEXT: .long Ltmp1-_imp_null_check_load ; Fault[0].HandlerOffset: ; CHECK-NEXT: .long Ltmp0-_imp_null_check_load + +; OBJDUMP: FaultMap table: +; OBJDUMP-NEXT: Version: 0x1 +; OBJDUMP-NEXT: NumFunctions: 3 +; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1 +; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 5 +; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1 +; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7 +; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1 +; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 3 diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll new file mode 100644 index 0000000000000..d4cd59ffac3ac 
--- /dev/null +++ b/test/CodeGen/X86/machine-combiner.ll @@ -0,0 +1,99 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s + +; Verify that the first two adds are independent regardless of how the inputs are +; commuted. The destination registers are used as source registers for the third add. + +define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds1: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %t0, %x2 + %t2 = fadd float %t1, %x3 + ret float %t2 +} + +define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds2: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %x2, %t0 + %t2 = fadd float %t1, %x3 + ret float %t2 +} + +define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds3: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %t0, %x2 + %t2 = fadd float %x3, %t1 + ret float %t2 +} + +define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds4: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %x2, %t0 + %t2 = fadd float %x3, %t1 + ret float %t2 +} + +; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not +; produced because that would cost more compile time. + +define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) { +; CHECK-LABEL: reassociate_adds5: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1 +; CHECK-NEXT: vaddss %xmm6, %xmm1, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm7, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %t0, %x2 + %t2 = fadd float %t1, %x3 + %t3 = fadd float %t2, %x4 + %t4 = fadd float %t3, %x5 + %t5 = fadd float %t4, %x6 + %t6 = fadd float %t5, %x7 + ret float %t6 +} + +; Verify that we only need two associative operations to reassociate the operands. +; Also, we should reassociate such that the result of the high latency division +; is used by the final 'add' rather than reassociating the %x3 operand with the +; division. The latter reassociation would not improve anything. 
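; As an IR-level sketch of the rebalancing checked throughout this file (the
; machine combiner rewrites MachineInstrs, so this is illustrative only, and
; the function name is invented): a serial chain like ((x0+x1)+x2)+x3 is
; turned into the form below, where the two inner adds have no dependence on
; each other and can issue in parallel.
define float @reassoc_sketch(float %x0, float %x1, float %x2, float %x3) {
  %a0 = fadd float %x0, %x1    ; independent of %a1
  %a1 = fadd float %x2, %x3    ; can issue alongside %a0
  %r = fadd float %a0, %a1     ; single dependent join
  ret float %r
}
; reassociate_adds6 below additionally expects the long-latency fdiv result
; to feed the final add instead of being reassociated deeper into the tree.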
+ +define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds6: +; CHECK: # BB#0: +; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv float %x0, %x1 + %t1 = fadd float %x2, %t0 + %t2 = fadd float %x3, %t1 + ret float %t2 +} + diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll index f89e52457f355..b02f9ec45e7fb 100644 --- a/test/CodeGen/X86/movtopush.ll +++ b/test/CodeGen/X86/movtopush.ll @@ -2,11 +2,15 @@ ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64 ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED +%class.Class = type { i32 } +%struct.s = type { i64 } + declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) +declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32 %c, i32 %d) declare void @oneparam(i32 %a) declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) - +declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d) ; Here, we should have a reserved frame, so we don't expect pushes ; NORMAL-LABEL: test1: @@ -108,13 +112,12 @@ entry: ret void } -; We don't support weird calling conventions +; We support weird calling conventions ; NORMAL-LABEL: test4: -; NORMAL: subl $12, %esp -; NORMAL-NEXT: movl $4, 8(%esp) -; NORMAL-NEXT: movl $3, 4(%esp) -; NORMAL-NEXT: movl $1, (%esp) -; NORMAL-NEXT: movl $2, %eax +; NORMAL: movl $2, %eax +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $1 ; NORMAL-NEXT: call ; NORMAL-NEXT: addl $12, %esp define void @test4() optsize { @@ -123,6 +126,20 @@ entry: ret void } +; NORMAL-LABEL: test4b: +; NORMAL: movl 4(%esp), %ecx +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: call +; NORMAL-NEXT: ret +define void @test4b(%class.Class* %f) optsize { +entry: + call x86_thiscallcc void @thiscall(%class.Class* %f, i32 1, i32 2, i32 3, i32 4) + ret void +} + ; When there is no reserved call frame, check that additional alignment ; is added when the pushes don't add up to the required alignment. 
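; A sketch of the accounting behind that check (illustrative numbers,
; assuming the 32-byte -stack-alignment RUN line above): four pushl
; instructions store only 16 bytes, so the backend must drop %esp by extra
; padding of its own to keep the outgoing area 32-byte aligned rather than
; relying on the pushes alone. A call shape that exercises this, using the
; four-i32 @good callee declared earlier (the function name is invented;
; test5 below is the real check):
define void @align_sketch() optsize {
entry:
  call void @good(i32 1, i32 2, i32 3, i32 4)
  ret void
}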
; ALIGNED-LABEL: test5: @@ -229,20 +246,27 @@ entry: ; NORMAL-NEXT: pushl $1 ; NORMAL-NEXT: call ; NORMAL-NEXT: addl $16, %esp -; NORMAL-NEXT: subl $16, %esp -; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]] -; NORMAL-NEXT: movl [[EAX]], 12(%esp) -; NORMAL-NEXT: movl $7, 8(%esp) -; NORMAL-NEXT: movl $6, 4(%esp) -; NORMAL-NEXT: movl $5, (%esp) +; NORMAL-NEXT: subl $20, %esp +; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]] +; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]] +; NORMAL-NEXT: movl [[E2]], 4(%esp) +; NORMAL-NEXT: movl [[E1]], (%esp) +; NORMAL-NEXT: leal 32(%esp), [[E3:%e..]] +; NORMAL-NEXT: movl [[E3]], 16(%esp) +; NORMAL-NEXT: leal 28(%esp), [[E4:%e..]] +; NORMAL-NEXT: movl [[E4]], 12(%esp) +; NORMAL-NEXT: movl $6, 8(%esp) ; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: addl $20, %esp define void @test9() optsize { entry: %p = alloca i32, align 4 + %q = alloca i32, align 4 + %s = alloca %struct.s, align 4 call void @good(i32 1, i32 2, i32 3, i32 4) - %0 = ptrtoint i32* %p to i32 - call void @good(i32 5, i32 6, i32 7, i32 %0) + %pv = ptrtoint i32* %p to i32 + %qv = ptrtoint i32* %q to i32 + call void @struct(%struct.s* byval %s, i32 6, i32 %qv, i32 %pv) ret void } @@ -291,28 +315,17 @@ define void @test11() optsize { ; Converting one mov into a push isn't worth it when ; doing so forces too much overhead for other calls. ; NORMAL-LABEL: test12: -; NORMAL: subl $16, %esp -; NORMAL-NEXT: movl $4, 8(%esp) -; NORMAL-NEXT: movl $3, 4(%esp) -; NORMAL-NEXT: movl $1, (%esp) -; NORMAL-NEXT: movl $2, %eax -; NORMAL-NEXT: calll _inreg -; NORMAL-NEXT: movl $8, 12(%esp) +; NORMAL: movl $8, 12(%esp) ; NORMAL-NEXT: movl $7, 8(%esp) ; NORMAL-NEXT: movl $6, 4(%esp) ; NORMAL-NEXT: movl $5, (%esp) ; NORMAL-NEXT: calll _good -; NORMAL-NEXT: movl $12, 8(%esp) -; NORMAL-NEXT: movl $11, 4(%esp) -; NORMAL-NEXT: movl $9, (%esp) -; NORMAL-NEXT: movl $10, %eax -; NORMAL-NEXT: calll _inreg -; NORMAL-NEXT: addl $16, %esp define void @test12() optsize { entry: - call void @inreg(i32 1, i32 2, i32 3, i32 4) + %s = alloca %struct.s, align 4 + call void @struct(%struct.s* %s, i32 2, i32 3, i32 4) call void @good(i32 5, i32 6, i32 7, i32 8) - call void @inreg(i32 9, i32 10, i32 11, i32 12) + call void @struct(%struct.s* %s, i32 10, i32 11, i32 12) ret void } @@ -324,13 +337,12 @@ entry: ; NORMAL-NEXT: pushl $1 ; NORMAL-NEXT: calll _good ; NORMAL-NEXT: addl $16, %esp -; NORMAL-NEXT: subl $12, %esp -; NORMAL-NEXT: movl $8, 8(%esp) -; NORMAL-NEXT: movl $7, 4(%esp) -; NORMAL-NEXT: movl $5, (%esp) -; NORMAL-NEXT: movl $6, %eax -; NORMAL-NEXT: calll _inreg -; NORMAL-NEXT: addl $12, %esp +; NORMAL-NEXT: subl $20, %esp +; NORMAL: movl $8, 16(%esp) +; NORMAL-NEXT: movl $7, 12(%esp) +; NORMAL-NEXT: movl $6, 8(%esp) +; NORMAL-NEXT: calll _struct +; NORMAL-NEXT: addl $20, %esp ; NORMAL-NEXT: pushl $12 ; NORMAL-NEXT: pushl $11 ; NORMAL-NEXT: pushl $10 @@ -339,8 +351,9 @@ entry: ; NORMAL-NEXT: addl $16, %esp define void @test12b() optsize { entry: - call void @good(i32 1, i32 2, i32 3, i32 4) - call void @inreg(i32 5, i32 6, i32 7, i32 8) + %s = alloca %struct.s, align 4 + call void @good(i32 1, i32 2, i32 3, i32 4) + call void @struct(%struct.s* %s, i32 6, i32 7, i32 8) call void @good(i32 9, i32 10, i32 11, i32 12) ret void } diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll index ae3ed3f8344a6..9db948adb4652 100644 --- a/test/CodeGen/X86/or-branch.ll +++ b/test/CodeGen/X86/or-branch.ll @@ -1,19 +1,28 @@ -; RUN: llc < %s -march=x86 | not grep set +; RUN: llc < %s -mtriple=i386-unknown-unknown 
-jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 +; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind { +; JUMP2-LABEL: foo: +; JUMP2-DAG: jl +; JUMP2-DAG: je +; +; JUMP1-LABEL: foo: +; JUMP1-DAG: sete +; JUMP1-DAG: setl +; JUMP1: orb +; JUMP1: jne entry: - %tmp = tail call i32 (...) @bar( ) ; <i32> [#uses=0] - %tmp.upgrd.1 = icmp eq i32 %X, 0 ; <i1> [#uses=1] - %tmp3 = icmp slt i32 %Y, 5 ; <i1> [#uses=1] - %tmp4 = or i1 %tmp3, %tmp.upgrd.1 ; <i1> [#uses=1] - br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock + %tmp1 = icmp eq i32 %X, 0 + %tmp3 = icmp slt i32 %Y, 5 + %tmp4 = or i1 %tmp3, %tmp1 + br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock -cond_true: ; preds = %entry - %tmp5 = tail call i32 (...) @bar( ) ; <i32> [#uses=0] - ret void +cond_true: + %tmp5 = tail call i32 (...) @bar( ) + ret void -UnifiedReturnBlock: ; preds = %entry - ret void +UnifiedReturnBlock: + ret void } declare i32 @bar(...) diff --git a/test/CodeGen/X86/pr23900.ll b/test/CodeGen/X86/pr23900.ll new file mode 100644 index 0000000000000..cbc77161c0428 --- /dev/null +++ b/test/CodeGen/X86/pr23900.ll @@ -0,0 +1,29 @@ +; RUN: llc -filetype=obj %s -o %t.o +; RUN: llvm-nm %t.o | FileCheck %s + +; Test that it doesn't crash (and produces an object file). +; This used to pass a symbol with a null name to code that expected a valid +; C string. + +; CHECK: U __CxxFrameHandler3 +; CHECK: T f +; CHECK: t f.cleanup +; CHECK: U g +; CHECK: U h + + +target triple = "x86_64-pc-windows-msvc18.0.0" +define void @f(i32 %x) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { + invoke void @h() + to label %invoke.cont unwind label %lpad +invoke.cont: + ret void +lpad: + landingpad { i8*, i32 } + cleanup + call void @g(i32 %x) + ret void +} +declare void @h() +declare i32 @__CxxFrameHandler3(...)
+declare void @g(i32 %x) diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll index 7f1521a83bcfd..8e02dad9d5aee 100644 --- a/test/CodeGen/X86/recip-fastmath.ll +++ b/test/CodeGen/X86/recip-fastmath.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 -recip=!divf,!vec-divf | FileCheck %s --check-prefix=NORECIP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE @@ -14,11 +14,11 @@ define float @reciprocal_estimate(float %x) #0 { %div = fdiv fast float 1.0, %x ret float %div -; CHECK-LABEL: reciprocal_estimate: -; CHECK: movss -; CHECK-NEXT: divss -; CHECK-NEXT: movaps -; CHECK-NEXT: retq +; NORECIP-LABEL: reciprocal_estimate: +; NORECIP: movss +; NORECIP-NEXT: divss +; NORECIP-NEXT: movaps +; NORECIP-NEXT: retq ; RECIP-LABEL: reciprocal_estimate: ; RECIP: vrcpss @@ -45,11 +45,11 @@ define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 { %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x ret <4 x float> %div -; CHECK-LABEL: reciprocal_estimate_v4f32: -; CHECK: movaps -; CHECK-NEXT: divps -; CHECK-NEXT: movaps -; CHECK-NEXT: retq +; NORECIP-LABEL: reciprocal_estimate_v4f32: +; NORECIP: movaps +; NORECIP-NEXT: divps +; NORECIP-NEXT: movaps +; NORECIP-NEXT: retq ; RECIP-LABEL: reciprocal_estimate_v4f32: ; RECIP: vrcpps @@ -76,14 +76,14 @@ define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 { %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x ret <8 x float> %div -; CHECK-LABEL: reciprocal_estimate_v8f32: -; CHECK: movaps -; CHECK: movaps -; CHECK-NEXT: divps -; CHECK-NEXT: divps -; CHECK-NEXT: movaps -; CHECK-NEXT: movaps -; CHECK-NEXT: retq +; NORECIP-LABEL: reciprocal_estimate_v8f32: +; NORECIP: movaps +; NORECIP: movaps +; NORECIP-NEXT: divps +; NORECIP-NEXT: divps +; NORECIP-NEXT: movaps +; NORECIP-NEXT: movaps +; NORECIP-NEXT: retq ; RECIP-LABEL: reciprocal_estimate_v8f32: ; RECIP: vrcpps diff --git a/test/CodeGen/X86/rrlist-livereg-corrutpion.ll b/test/CodeGen/X86/rrlist-livereg-corrutpion.ll new file mode 100644 index 0000000000000..7191e0453a668 --- /dev/null +++ b/test/CodeGen/X86/rrlist-livereg-corrutpion.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s + +; CHECK-LABEL: test +define i64 @test(i64 %a, i256 %b, i1 %c) { + %u = zext i64 %a to i256 + %s = add i256 %u, 1 + %o = trunc i256 %s to i1 + %j = add i256 %s, 1 + %i = icmp ule i64 %a, 1 + %f = select i1 %o, i256 undef, i256 %j + %d = select i1 %i, i256 %f, i256 1 + %e = add i256 %b, 1 + %n = select i1 %c, i256 %e, i256 %b + %m = trunc i256 %n to i64 + %h = add i64 %m, 1 + %r = zext i64 %h to i256 + %v = lshr i256 %d, %r + %t = trunc i256 %v to i1 + %q = shl i256 1, %r + %p = and i256 %d, %q + %w = icmp ule i256 %n, 1 + %y = select i1 %t, i256 undef, i256 %p + %x = select i1 %w, i256 %y, i256 %d + %z = trunc i256 %x to i64 + ret i64 %z +} diff --git a/test/CodeGen/X86/sdiv-exact.ll b/test/CodeGen/X86/sdiv-exact.ll index 4f8d3f05351b2..a6ace5bc31c1a 100644 --- a/test/CodeGen/X86/sdiv-exact.ll +++ b/test/CodeGen/X86/sdiv-exact.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=x86 < %s | FileCheck %s +; RUN: llc -march=x86 -mattr=+sse2 < %s | FileCheck %s define i32 @test1(i32 %x) { %div = sdiv 
exact i32 %x, 25 @@ -16,3 +16,14 @@ define i32 @test2(i32 %x) { ; CHECK-NEXT: imull $-1431655765 ; CHECK-NEXT: ret } + +define <4 x i32> @test3(<4 x i32> %x) { + %div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24> + ret <4 x i32> %div +; CHECK-LABEL: test3: +; CHECK: psrad $3, +; CHECK: pmuludq +; CHECK: pmuludq +; CHECK-NOT: psrad +; CHECK: ret +} diff --git a/test/CodeGen/X86/seh-catch-all-win32.ll b/test/CodeGen/X86/seh-catch-all-win32.ll index 28b0bca962ea8..423b9914e99d2 100644 --- a/test/CodeGen/X86/seh-catch-all-win32.ll +++ b/test/CodeGen/X86/seh-catch-all-win32.ll @@ -12,7 +12,7 @@ declare i32 @llvm.eh.typeid.for(i8*) declare i8* @llvm.frameaddress(i32) declare i8* @llvm.framerecover(i8*, i8*, i32) declare void @llvm.frameescape(...) -declare i8* @llvm.x86.seh.exceptioninfo(i8*, i8*) +declare i8* @llvm.x86.seh.recoverfp(i8*, i8*) define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) { entry: @@ -43,14 +43,16 @@ eh.resume: ; preds = %lpad define internal i32 @"filt$main"() { entry: - %0 = tail call i8* @llvm.frameaddress(i32 1) - %1 = tail call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %0, i32 0) - %__exceptioncode = bitcast i8* %1 to i32* - %2 = tail call i8* @llvm.x86.seh.exceptioninfo(i8* bitcast (i32 ()* @main to i8*), i8* %0) - %3 = bitcast i8* %2 to i32** - %4 = load i32*, i32** %3, align 4 - %5 = load i32, i32* %4, align 4 - store i32 %5, i32* %__exceptioncode, align 4 + %ebp = tail call i8* @llvm.frameaddress(i32 1) + %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp) + %code.i8 = tail call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0) + %__exceptioncode = bitcast i8* %code.i8 to i32* + %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20 + %0 = bitcast i8* %info.addr to i32*** + %1 = load i32**, i32*** %0, align 4 + %2 = load i32*, i32** %1, align 4 + %3 = load i32, i32* %2, align 4 + store i32 %3, i32* %__exceptioncode, align 4 ret i32 1 } @@ -76,10 +78,17 @@ entry: ; CHECK: calll _printf ; CHECK: .section .xdata,"dr" +; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1 ; CHECK: L__ehtable$main ; CHECK-NEXT: .long -1 ; CHECK-NEXT: .long _filt$main ; CHECK-NEXT: .long Ltmp{{[0-9]+}} ; CHECK-LABEL: _filt$main: -; CHECK: movl +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: movl (%ebp), %[[oldebp:[a-z]+]] +; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]] +; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]] +; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]] +; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}}) diff --git a/test/CodeGen/X86/seh-filter-no-personality.ll b/test/CodeGen/X86/seh-filter-no-personality.ll new file mode 100644 index 0000000000000..87bc9c93f4004 --- /dev/null +++ b/test/CodeGen/X86/seh-filter-no-personality.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s + +; Mostly make sure that llvm.x86.seh.recoverfp doesn't crash if the parent +; function lacks a personality. 
+ +declare i8* @llvm.frameaddress(i32) +declare i8* @llvm.x86.seh.recoverfp(i8*, i8*) + +define i32 @main() { +entry: + ret i32 0 +} + +define internal i32 @"filt$main"() { +entry: + %ebp = tail call i8* @llvm.frameaddress(i32 1) + %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp) + %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20 + %0 = bitcast i8* %info.addr to i32*** + %1 = load i32**, i32*** %0, align 4 + %2 = load i32*, i32** %1, align 4 + %3 = load i32, i32* %2, align 4 + %matches = icmp eq i32 %3, u0xC0000005 + %r = zext i1 %matches to i32 + ret i32 %r +} + +; CHECK: _main: +; CHECK: xorl %eax, %eax +; CHECK: retl + +; CHECK: _filt$main: +; CHECK: retl diff --git a/test/CodeGen/X86/seh-safe-div-win32.ll b/test/CodeGen/X86/seh-safe-div-win32.ll index 0f76ec07a6b61..b1bcde2c7ff3b 100644 --- a/test/CodeGen/X86/seh-safe-div-win32.ll +++ b/test/CodeGen/X86/seh-safe-div-win32.ll @@ -122,27 +122,30 @@ entry: ; ... ; } EXCEPTION_RECORD; -; FIXME: Use llvm.eh.exceptioninfo for this. -declare i32 @safe_div_filt0() -declare i32 @safe_div_filt1() -; define i32 @safe_div_filt0() { -; %eh_ptrs_c = bitcast i8* %eh_ptrs to i32** -; %eh_rec = load i32*, i32** %eh_ptrs_c -; %eh_code = load i32, i32* %eh_rec -; ; EXCEPTION_ACCESS_VIOLATION = 0xC0000005 -; %cmp = icmp eq i32 %eh_code, 3221225477 -; %filt.res = zext i1 %cmp to i32 -; ret i32 %filt.res -; } -; define i32 @safe_div_filt1() { -; %eh_ptrs_c = bitcast i8* %eh_ptrs to i32** -; %eh_rec = load i32*, i32** %eh_ptrs_c -; %eh_code = load i32, i32* %eh_rec -; ; EXCEPTION_INT_DIVIDE_BY_ZERO = 0xC0000094 -; %cmp = icmp eq i32 %eh_code, 3221225620 -; %filt.res = zext i1 %cmp to i32 -; ret i32 %filt.res -; } +define i32 @safe_div_filt0() { + %ebp = call i8* @llvm.frameaddress(i32 1) + %eh_ptrs.addr.i8 = getelementptr inbounds i8, i8* %ebp, i32 -20 + %eh_ptrs.addr = bitcast i8* %eh_ptrs.addr.i8 to i32*** + %eh_ptrs = load i32**, i32*** %eh_ptrs.addr + %eh_rec = load i32*, i32** %eh_ptrs + %eh_code = load i32, i32* %eh_rec + ; EXCEPTION_ACCESS_VIOLATION = 0xC0000005 + %cmp = icmp eq i32 %eh_code, 3221225477 + %filt.res = zext i1 %cmp to i32 + ret i32 %filt.res +} +define i32 @safe_div_filt1() { + %ebp = call i8* @llvm.frameaddress(i32 1) + %eh_ptrs.addr.i8 = getelementptr inbounds i8, i8* %ebp, i32 -20 + %eh_ptrs.addr = bitcast i8* %eh_ptrs.addr.i8 to i32*** + %eh_ptrs = load i32**, i32*** %eh_ptrs.addr + %eh_rec = load i32*, i32** %eh_ptrs + %eh_code = load i32, i32* %eh_rec + ; EXCEPTION_INT_DIVIDE_BY_ZERO = 0xC0000094 + %cmp = icmp eq i32 %eh_code, 3221225620 + %filt.res = zext i1 %cmp to i32 + ret i32 %filt.res +} @str_result = internal constant [21 x i8] c"safe_div result: %d\0A\00" @@ -170,3 +173,4 @@ declare i32 @llvm.eh.typeid.for(i8*) readnone nounwind declare void @puts(i8*) declare void @printf(i8*, ...) 
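; Annotated sketch of the walk both filters above perform; each load mirrors
; the EXCEPTION_RECORD layout quoted above, the -20 offset matches the CHECK
; lines in the win32 SEH tests, and only the function name here is invented
; (llvm.frameaddress is declared just below):
define i32 @filter_sketch() {
  %ebp = call i8* @llvm.frameaddress(i32 1)                  ; parent's frame pointer
  %ptrs.slot = getelementptr inbounds i8, i8* %ebp, i32 -20  ; slot holding EXCEPTION_POINTERS*
  %ptrs.addr = bitcast i8* %ptrs.slot to i32***
  %ptrs = load i32**, i32*** %ptrs.addr                      ; EXCEPTION_POINTERS*
  %rec = load i32*, i32** %ptrs                              ; ->ExceptionRecord
  %code = load i32, i32* %rec                                ; ->ExceptionCode
  ret i32 %code
}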
declare void @abort() +declare i8* @llvm.frameaddress(i32) diff --git a/test/CodeGen/X86/shift-combine.ll b/test/CodeGen/X86/shift-combine.ll index ec62bcdcdba1b..43301041a0b69 100644 --- a/test/CodeGen/X86/shift-combine.ll +++ b/test/CodeGen/X86/shift-combine.ll @@ -17,3 +17,62 @@ entry: ret i32 %tmp5 } +define i32* @test_exact1(i32 %a, i32 %b, i32* %x) { +; CHECK-LABEL: test_exact1: +; CHECK: sarl % + + %sub = sub i32 %b, %a + %shr = ashr exact i32 %sub, 3 + %gep = getelementptr inbounds i32, i32* %x, i32 %shr + ret i32* %gep +} + +define i32* @test_exact2(i32 %a, i32 %b, i32* %x) { +; CHECK-LABEL: test_exact2: +; CHECK: sarl % + + %sub = sub i32 %b, %a + %shr = ashr exact i32 %sub, 3 + %gep = getelementptr inbounds i32, i32* %x, i32 %shr + ret i32* %gep +} + +define i32* @test_exact3(i32 %a, i32 %b, i32* %x) { +; CHECK-LABEL: test_exact3: +; CHECK-NOT: sarl + + %sub = sub i32 %b, %a + %shr = ashr exact i32 %sub, 2 + %gep = getelementptr inbounds i32, i32* %x, i32 %shr + ret i32* %gep +} + +define i32* @test_exact4(i32 %a, i32 %b, i32* %x) { +; CHECK-LABEL: test_exact4: +; CHECK: shrl % + + %sub = sub i32 %b, %a + %shr = lshr exact i32 %sub, 3 + %gep = getelementptr inbounds i32, i32* %x, i32 %shr + ret i32* %gep +} + +define i32* @test_exact5(i32 %a, i32 %b, i32* %x) { +; CHECK-LABEL: test_exact5: +; CHECK: shrl % + + %sub = sub i32 %b, %a + %shr = lshr exact i32 %sub, 3 + %gep = getelementptr inbounds i32, i32* %x, i32 %shr + ret i32* %gep +} + +define i32* @test_exact6(i32 %a, i32 %b, i32* %x) { +; CHECK-LABEL: test_exact6: +; CHECK-NOT: shrl + + %sub = sub i32 %b, %a + %shr = lshr exact i32 %sub, 2 + %gep = getelementptr inbounds i32, i32* %x, i32 %shr + ret i32* %gep +} diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll index 373fa53c970f8..0f8d9f4d713fa 100644 --- a/test/CodeGen/X86/sqrt-fastmath.ll +++ b/test/CodeGen/X86/sqrt-fastmath.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 -recip=!sqrtf,!vec-sqrtf,!divf,!vec-divf | FileCheck %s --check-prefix=NORECIP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE declare double @__sqrt_finite(double) #0 @@ -10,10 +10,10 @@ declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0 define double @fd(double %d) #0 { -; CHECK-LABEL: fd: -; CHECK: # BB#0: -; CHECK-NEXT: sqrtsd %xmm0, %xmm0 -; CHECK-NEXT: retq +; NORECIP-LABEL: fd: +; NORECIP: # BB#0: +; NORECIP-NEXT: sqrtsd %xmm0, %xmm0 +; NORECIP-NEXT: retq ; ; ESTIMATE-LABEL: fd: ; ESTIMATE: # BB#0: @@ -25,10 +25,10 @@ define double @fd(double %d) #0 { define float @ff(float %f) #0 { -; CHECK-LABEL: ff: -; CHECK: # BB#0: -; CHECK-NEXT: sqrtss %xmm0, %xmm0 -; CHECK-NEXT: retq +; NORECIP-LABEL: ff: +; NORECIP: # BB#0: +; NORECIP-NEXT: sqrtss %xmm0, %xmm0 +; NORECIP-NEXT: retq ; ; ESTIMATE-LABEL: ff: ; ESTIMATE: # BB#0: @@ -49,11 +49,11 @@ define float @ff(float %f) #0 { define x86_fp80 @fld(x86_fp80 %ld) #0 { -; CHECK-LABEL: fld: -; CHECK: # BB#0: -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fsqrt -; CHECK-NEXT: retq +; NORECIP-LABEL: fld: +; NORECIP: # BB#0: +; NORECIP-NEXT: fldt {{[0-9]+}}(%rsp) +; NORECIP-NEXT: fsqrt +; NORECIP-NEXT: retq ; ; ESTIMATE-LABEL: fld: ; ESTIMATE: # BB#0: @@ -67,12 +67,12 @@ define x86_fp80 @fld(x86_fp80 %ld) #0 { define float @reciprocal_square_root(float %x) #0 { -; CHECK-LABEL: reciprocal_square_root: -; CHECK: # BB#0: -; CHECK-NEXT: sqrtss %xmm0, 
%xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: divss %xmm1, %xmm0 -; CHECK-NEXT: retq +; NORECIP-LABEL: reciprocal_square_root: +; NORECIP: # BB#0: +; NORECIP-NEXT: sqrtss %xmm0, %xmm1 +; NORECIP-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NORECIP-NEXT: divss %xmm1, %xmm0 +; NORECIP-NEXT: retq ; ; ESTIMATE-LABEL: reciprocal_square_root: ; ESTIMATE: # BB#0: @@ -89,12 +89,12 @@ define float @reciprocal_square_root(float %x) #0 { } define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 { -; CHECK-LABEL: reciprocal_square_root_v4f32: -; CHECK: # BB#0: -; CHECK-NEXT: sqrtps %xmm0, %xmm1 -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; CHECK-NEXT: divps %xmm1, %xmm0 -; CHECK-NEXT: retq +; NORECIP-LABEL: reciprocal_square_root_v4f32: +; NORECIP: # BB#0: +; NORECIP-NEXT: sqrtps %xmm0, %xmm1 +; NORECIP-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; NORECIP-NEXT: divps %xmm1, %xmm0 +; NORECIP-NEXT: retq ; ; ESTIMATE-LABEL: reciprocal_square_root_v4f32: ; ESTIMATE: # BB#0: @@ -111,15 +111,15 @@ define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 { } define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 { -; CHECK-LABEL: reciprocal_square_root_v8f32: -; CHECK: # BB#0: -; CHECK-NEXT: sqrtps %xmm1, %xmm2 -; CHECK-NEXT: sqrtps %xmm0, %xmm3 -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: divps %xmm3, %xmm0 -; CHECK-NEXT: divps %xmm2, %xmm1 -; CHECK-NEXT: retq +; NORECIP-LABEL: reciprocal_square_root_v8f32: +; NORECIP: # BB#0: +; NORECIP-NEXT: sqrtps %xmm1, %xmm2 +; NORECIP-NEXT: sqrtps %xmm0, %xmm3 +; NORECIP-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; NORECIP-NEXT: movaps %xmm1, %xmm0 +; NORECIP-NEXT: divps %xmm3, %xmm0 +; NORECIP-NEXT: divps %xmm2, %xmm1 +; NORECIP-NEXT: retq ; ; ESTIMATE-LABEL: reciprocal_square_root_v8f32: ; ESTIMATE: # BB#0: diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll index 95f0c3d3a188b..63acf5f4f96f4 100644 --- a/test/CodeGen/X86/stack-folding-fp-sse42.ll +++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -314,7 +314,13 @@ define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) { } declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone -; TODO stack_fold_cvtsd2ss +define float @stack_fold_cvtsd2ss(double %a0) optsize { + ;CHECK-LABEL: stack_fold_cvtsd2ss + ;CHECK: cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = fptrunc double %a0 to float + ret float %2 +} define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize { ;CHECK-LABEL: stack_fold_cvtsd2ss_int diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll index e930d244638a8..a164fbbc7a6ae 100644 --- a/test/CodeGen/X86/stack-folding-int-avx2.ll +++ b/test/CodeGen/X86/stack-folding-int-avx2.ll @@ -867,9 +867,21 @@ define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) { ret <8 x i32> %2 } -; TODO stack_fold_pshufhw +define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) { + ;CHECK-LABEL: stack_fold_vpshufhw + ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 
32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> + ret <16 x i16> %2 +} -; TODO stack_fold_pshuflw +define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) { + ;CHECK-LABEL: stack_fold_vpshuflw + ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> + ret <16 x i16> %2 +} define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) { ;CHECK-LABEL: stack_fold_psignb diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll index 6bb0d8980e5bd..e18476cee53c5 100644 --- a/test/CodeGen/X86/statepoint-stackmap-format.ll +++ b/test/CodeGen/X86/statepoint-stackmap-format.ll @@ -1,10 +1,11 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc < %s -mtriple="x86_64-pc-linux-gnu" | FileCheck %s +; RUN: llc < %s -mtriple="x86_64-pc-win64-coff" | FileCheck %s + ; This test is a sanity check to ensure statepoints are generating StackMap ; sections correctly. This is not intended to be a rigorous test of the ; StackMap format (see the stackmap tests for that). target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-pc-linux-gnu" declare zeroext i1 @return_i1() diff --git a/test/CodeGen/X86/system-intrinsics-64.ll b/test/CodeGen/X86/system-intrinsics-64.ll new file mode 100644 index 0000000000000..96c4417733902 --- /dev/null +++ b/test/CodeGen/X86/system-intrinsics-64.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define void @test_fxsave(i8* %ptr) { +; CHECK-LABEL: test_fxsave +; CHECK: fxsave + call void @llvm.x86.fxsave(i8* %ptr) + ret void; +} +declare void @llvm.x86.fxsave(i8*) + +define void @test_fxsave64(i8* %ptr) { +; CHECK-LABEL: test_fxsave64 +; CHECK: fxsave64 + call void @llvm.x86.fxsave64(i8* %ptr) + ret void; +} +declare void @llvm.x86.fxsave64(i8*) + +define void @test_fxrstor(i8* %ptr) { +; CHECK-LABEL: test_fxrstor +; CHECK: fxrstor + call void @llvm.x86.fxrstor(i8* %ptr) + ret void; +} +declare void @llvm.x86.fxrstor(i8*) + +define void @test_fxrstor64(i8* %ptr) { +; CHECK-LABEL: test_fxrstor64 +; CHECK: fxrstor64 + call void @llvm.x86.fxrstor64(i8* %ptr) + ret void; +} +declare void @llvm.x86.fxrstor64(i8*) diff --git a/test/CodeGen/X86/system-intrinsics.ll b/test/CodeGen/X86/system-intrinsics.ll new file mode 100644 index 0000000000000..84fcd052d7dbf --- /dev/null +++ b/test/CodeGen/X86/system-intrinsics.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s + +define void @test_fxsave(i8* %ptr) { +; CHECK-LABEL: test_fxsave +; CHECK: fxsave + call void @llvm.x86.fxsave(i8* %ptr) + ret void; +} +declare void @llvm.x86.fxsave(i8*) + +define void @test_fxrstor(i8* %ptr) { +; CHECK-LABEL: test_fxrstor +; CHECK: fxrstor + call void @llvm.x86.fxrstor(i8* %ptr) + ret void; +} +declare void 
@llvm.x86.fxrstor(i8*) diff --git a/test/CodeGen/X86/twoaddr-lea.ll b/test/CodeGen/X86/twoaddr-lea.ll index b5ca0275d8d69..5779cf33ac84c 100644 --- a/test/CodeGen/X86/twoaddr-lea.ll +++ b/test/CodeGen/X86/twoaddr-lea.ll @@ -25,8 +25,7 @@ define i32 @test2(i32 inreg %a, i32 inreg %b, i32 %c, i32 %d) nounwind { entry: ; CHECK-LABEL: test2: ; CHECK: leal -; CHECK-NOT: leal -; CHECK-NOT: mov +; CHECK-NEXT: addl ; CHECK-NEXT: addl ; CHECK-NEXT: ret %add = add i32 %b, %a diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 8dded07af7d4d..ca8be65075b90 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -50,31 +50,15 @@ define <2 x double> @sitofp_2vf64_i32(<4 x i32> %a) { define <2 x double> @sitofp_2vf64_i16(<8 x i16> %a) { ; SSE2-LABEL: sitofp_2vf64_i16: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: movswq %ax, %rax -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: movswq %cx, %rcx -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: sitofp_2vf64_i16: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: movswq %ax, %rax -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movswq %cx, %rcx -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1> %cvt = sitofp <2 x i16> %shuf to <2 x double> @@ -86,30 +70,14 @@ define <2 x double> @sitofp_2vf64_i8(<16 x i8> %a) { ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm1, %rax -; SSE2-NEXT: movsbq %al, %rax -; SSE2-NEXT: movd %xmm0, %rcx -; SSE2-NEXT: movsbq %cl, %rcx -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: sitofp_2vf64_i8: ; AVX: # BB#0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: movsbq %al, %rax -; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movsbq %cl, %rcx -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1> %cvt = sitofp <2 x i8> %shuf to <2 x 
double> diff --git a/test/CodeGen/X86/vec_shift8.ll b/test/CodeGen/X86/vec_shift8.ll deleted file mode 100644 index 9d19f667ea9b2..0000000000000 --- a/test/CodeGen/X86/vec_shift8.ll +++ /dev/null @@ -1,527 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX - -; -; Vectorized integer shifts -; - -define <2 x i64> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp { -entry: -; ALL-NOT: shll -; -; SSE2: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psllw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psllw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psllw $2, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psllw $1, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpsllw $12, %xmm1, %xmm2 -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $1, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq - %shl = shl <8 x i16> %r, %a - %tmp2 = bitcast <8 x i16> %shl to <2 x i64> - ret <2 x i64> %tmp2 -} - -define <2 x i64> @shl_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { -entry: -; SSE2: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; 
SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psllw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psllw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq - %shl = shl <16 x i8> %r, %a - %tmp2 = bitcast <16 x i8> %shl to <2 x i64> - ret <2 x i64> %tmp2 -} - -define <2 x i64> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp { -entry: -; ALL-NOT: sarw -; -; SSE2: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psraw $4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; 
SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psraw $2, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpsllw $12, %xmm1, %xmm2 -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $8, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $4, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $1, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq - %ashr = ashr <8 x i16> %r, %a - %tmp2 = bitcast <8 x i16> %ashr to <2 x i64> - ret <2 x i64> %tmp2 -} - -define <2 x i64> @ashr_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { -entry: -; ALL-NOT: sarb -; -; SSE2: punpckhbw {{.*#}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $2, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE41-NEXT: punpckhbw {{.*#}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $4, %xmm4 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $2, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: punpcklbw {{.*#}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm2, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhbw {{.*#}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX-NEXT: vpunpckhbw {{.*#}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpunpcklbw {{.*#}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq - %ashr = ashr <16 x i8> %r, %a - %tmp2 = bitcast <16 x i8> %ashr to <2 x i64> - ret <2 x i64> %tmp2 -} - -define <2 x i64> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp { -entry: -; ALL-NOT: shrl -; -; SSE2: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, 
%xmm3 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $2, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpsllw $12, %xmm1, %xmm2 -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq - %lshr = lshr <8 x i16> %r, %a - %tmp2 = bitcast <8 x i16> %lshr to <2 x i64> - ret <2 x i64> %tmp2 -} - -define <2 x i64> @lshr_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { -entry: -; ALL-NOT: shrb -; -; SSE2: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $2, %xmm3 -; SSE41-NEXT: pand 
{{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $1, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq - %lshr = lshr <16 x i8> %r, %a - %tmp2 = bitcast <16 x i8> %lshr to <2 x i64> - ret <2 x i64> %tmp2 -} diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index e6acc7efaf394..aafc05b2ed4ce 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -117,6 +117,46 @@ entry: ret <4 x i64>%B } +define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_2i8_to_i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_2i8_to_i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_2i8_to_i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_2i8_to_i32: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_2i8_to_i32: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41: pmovsxbw %xmm0, %xmm0 +; X32-SSE41-NEXT: movd %xmm0, %eax +; X32-SSE41-NEXT: popl %edx +; X32-SSE41-NEXT: retl +entry: + %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1> + %Ex = sext <2 x i8> %Shuf to <2 x i16> + %Bc = bitcast <2 x i16> %Ex to i32 + ret i32 %Bc +} + define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) { ; SSE2-LABEL: load_sext_test1: ; SSE2: # BB#0: # %entry diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll new file mode 100644 index 0000000000000..4fd2f8b51b8b2 --- /dev/null +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -0,0 +1,1041 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +; +; Variable Shifts +; + +define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: var_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movd 
%xmm0, %rax +; SSE2-NEXT: movd %xmm1, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: pextrq $1, %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movd %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vpextrq $1, %xmm1, %rcx +; AVX-NEXT: sarq %cl, %rax +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovq %xmm1, %rcx +; AVX-NEXT: sarq %cl, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-NEXT: retq + %shift = ashr <2 x i64> %a, %b + ret <2 x i64> %shift +} + +define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: var_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: sarl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: sarl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: sarl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: sarl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pextrd $1, %xmm1, %ecx +; SSE41-NEXT: sarl %cl, %eax +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: sarl %cl, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: pextrd $2, %xmm1, %ecx +; SSE41-NEXT: sarl %cl, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: sarl %cl, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %eax +; 
AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = ashr <4 x i32> %a, %b + ret <4 x i32> %shift +} + +define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: var_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $2, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psraw $1, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v8i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v8i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %shift = ashr <8 x i16> %a, %b + ret <8 x i16> %shift +} + +define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: var_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $4, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = ashr <16 x i8> %a, %b + ret <16 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: splatvar_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movd %xmm2, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rcx +; SSE2-NEXT: sarq %cl, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: pextrq $1, %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movd %xmm1, %rcx +; SSE41-NEXT: sarq %cl, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; 
AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: retq + %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer + %shift = ashr <2 x i64> %a, %splat + ret <2 x i64> %shift +} + +define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: splatvar_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: psrad %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; SSE41-NEXT: psrad %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer + %shift = ashr <4 x i32> %a, %splat + ret <4 x i32> %shift +} + +define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: splatvar_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psraw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; SSE41-NEXT: psraw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i16> %a, %splat + ret <8 x i16> %shift +} + +define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: splatvar_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: psllw $5, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: pand 
%xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v16i8: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw 
{{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v16i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq + %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + %shift = ashr <16 x i8> %a, %splat + ret <16 x i8> %shift +} + +; +; Constant Shifts +; + +define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { +; SSE2-LABEL: constant_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: sarq %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: sarq $7, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; 
SSE41-NEXT: sarq $7, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: sarq %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: sarq $7, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: sarq %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %shift = ashr <2 x i64> %a, <i64 1, i64 7> + ret <2 x i64> %shift +} + +define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { +; SSE2-LABEL: constant_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: sarl $7, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: sarl $5, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: sarl $4, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: sarl $6, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: sarl $5, %eax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: sarl $4, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: sarl $6, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: sarl $7, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: sarl $5, %eax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: sarl $4, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: sarl $6, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: sarl $7, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> + ret <4 x i32> %shift +} + +define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { +; SSE2-LABEL: constant_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; 
SSE41-NEXT: psraw $8, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $4, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $2, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v8i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> + ret <8 x i16> %shift +} + +define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { +; SSE2-LABEL: constant_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE2-NEXT: psllw $5, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $4, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: psraw $2, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por 
%xmm4, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm5 +; SSE2-NEXT: psraw $2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psraw $1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE41-NEXT: psllw $5, %xmm3 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $4, %xmm4 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $2, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psraw $1, %xmm4 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psraw $4, %xmm3 +; SSE41-NEXT: pblendvb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psraw $2, %xmm3 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: psraw $1, %xmm3 +; SSE41-NEXT: paddw %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX-NEXT: vpunpcklbw 
{{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 +; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <16 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { +; SSE2-LABEL: splatconstant_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: sarq $7, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: sarq $7, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatconstant_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: sarq $7, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: sarq $7, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: sarq $7, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: sarq $7, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %shift = ashr <2 x i64> %a, <i64 7, i64 7> + ret <2 x i64> %shift +} + +define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { +; SSE-LABEL: splatconstant_shift_v4i32: +; SSE: # BB#0: +; SSE-NEXT: psrad $5, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5> + ret <4 x i32> %shift +} + +define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { +; SSE-LABEL: splatconstant_shift_v8i16: +; SSE: # BB#0: +; SSE-NEXT: psraw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %shift +} + +define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { +; SSE-LABEL: splatconstant_shift_v16i8: +; SSE: # BB#0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 
3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <16 x i8> %shift +} diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll new file mode 100644 index 0000000000000..3fc377af56500 --- /dev/null +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -0,0 +1,767 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +; +; Variable Shifts +; + +define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: var_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrq $1, %xmm3, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vmovq %xmm3, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm4 +; AVX2-NEXT: vmovq %xmm2, %rax +; AVX2-NEXT: vmovq %xmm3, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm3 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <4 x i64> %a, %b + ret <4 x i64> %shift +} + +define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: var_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrd $1, %xmm2, %eax +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrd $1, %xmm3, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vmovd %xmm2, %edx +; AVX1-NEXT: vmovd %xmm3, %ecx +; AVX1-NEXT: sarl %cl, %edx +; AVX1-NEXT: vmovd %edx, %xmm4 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 +; AVX1-NEXT: vpextrd $2, %xmm2, %eax +; AVX1-NEXT: vpextrd $2, %xmm3, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4 +; AVX1-NEXT: vpextrd $3, %xmm2, %eax +; AVX1-NEXT: vpextrd $3, %xmm3, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2 +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %edx +; AVX1-NEXT: vmovd %edx, %xmm3 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; 
AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: sarl %cl, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <8 x i32> %a, %b + ret <8 x i32> %shift +} + +define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: var_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm4 +; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <16 x i16> %a, %b + ret <16 x i16> %shift +} + +define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: var_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, 
%xmm5 +; AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm4 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm4 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <32 x i8> %a, %b + ret <32 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: splatvar_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm2, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: movb %al, %cl +; AVX1-NEXT: sarq %cl, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vmovq %xmm2, %rsi +; AVX1-NEXT: vmovq %xmm1, %rdx +; AVX1-NEXT: movb %dl, %cl +; AVX1-NEXT: sarq %cl, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: movb %al, %cl +; AVX1-NEXT: sarq %cl, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movb %dl, %cl +; AVX1-NEXT: sarq %cl, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm4 +; AVX2-NEXT: vmovq %xmm2, %rax +; AVX2-NEXT: vmovq %xmm3, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm3 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: sarq %cl, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer + %shift = ashr <4 x i64> %a, %splat + ret <4 x i64> %shift +} + +define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: splatvar_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; 
AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i32> %a, %splat + ret <8 x i32> %shift +} + +define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: splatvar_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: movzwl %ax, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer + %shift = ashr <16 x i16> %a, %splat + ret <16 x i16> %shift +} + +define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: splatvar_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm9 +; AVX1-NEXT: vpblendvb %xmm9, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm8 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $4, %xmm5, %xmm3 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm5, %xmm2 +; AVX1-NEXT: vpsraw $2, %xmm2, %xmm3 +; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $1, %xmm2, %xmm3 +; AVX1-NEXT: vpblendvb %xmm9, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: 
vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer + %shift = ashr <32 x i8> %a, %splat + ret <32 x i8> %shift +} + +; +; Constant Shifts +; + +define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { +; AVX1-LABEL: constant_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: sarq $62, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: sarq $31, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: sarq $7, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: sarq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: sarq $62, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: sarq $31, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: sarq $7, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: sarq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; 
AVX2-NEXT: retq + %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> + ret <4 x i64> %shift +} + +define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { +; AVX1-LABEL: constant_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: sarl $9, %eax +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: sarl $8, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %eax +; AVX1-NEXT: sarl $8, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: sarl $7, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: sarl $5, %eax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: sarl $4, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: sarl $6, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: sarl $7, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> + ret <8 x i32> %shift +} + +define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { +; AVX1-LABEL: constant_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsraw $8, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; 
AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + ret <16 x i16> %shift +} + +define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { +; AVX1-LABEL: constant_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm9 +; AVX1-NEXT: vpblendvb %xmm9, %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm8 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $4, %xmm5, %xmm3 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm5, %xmm2 +; AVX1-NEXT: vpsraw $2, %xmm2, %xmm3 +; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $1, %xmm2, %xmm3 +; AVX1-NEXT: vpblendvb %xmm9, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <32 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { +; AVX1-LABEL: splatconstant_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: sarq $7, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: sarq $7, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: sarq $7, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: sarq $7, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: sarq $7, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: sarq $7, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: sarq $7, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: sarq $7, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> + ret <4 x i64> %shift +} + +define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { +; AVX1-LABEL: 
splatconstant_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrad $5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrad $5, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <8 x i32> %shift +} + +define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { +; AVX1-LABEL: splatconstant_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpsraw $3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <16 x i16> %shift +} + +define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { +; AVX1-LABEL: splatconstant_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <32 x i8> %shift +} diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll new file mode 100644 index 0000000000000..f5a7e28383fe5 --- /dev/null +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -0,0 +1,778 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +; +; Variable Shifts +; + +define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: var_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlq %xmm3, %xmm2 +; SSE2-NEXT: psrlq %xmm1, %xmm0 +; 
SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: psrlq %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = lshr <2 x i64> %a, %b + ret <2 x i64> %shift +} + +define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: var_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pextrd $1, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: pextrd $2, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = lshr <4 x i32> %a, %b + ret <4 x i32> %shift +} + +define <8 x i16> @var_shift_v8i16(<8 x i16> 
%a, <8 x i16> %b) { +; SSE2-LABEL: var_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrlw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v8i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v8i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %shift = lshr <8 x i16> %a, %b + ret <8 x i16> %shift +} + +define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: var_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: 
psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = lshr <16 x i8> %a, %b + ret <16 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { +; SSE-LABEL: splatvar_shift_v2i64: +; SSE: # BB#0: +; SSE-NEXT: psrlq %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer + %shift = lshr <2 x i64> %a, %splat + ret <2 x i64> %shift +} + +define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: splatvar_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: psrld %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; SSE41-NEXT: psrld %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> 
zeroinitializer + %shift = lshr <4 x i32> %a, %splat + ret <4 x i32> %shift +} + +define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: splatvar_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; SSE41-NEXT: psrlw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i16> %a, %splat + ret <8 x i16> %shift +} + +define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: splatvar_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm4 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v16i8: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: 
vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v16i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq + %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + %shift = lshr <16 x i8> %a, %splat + ret <16 x i8> %shift +} + +; +; Constant Shifts +; + +define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { +; SSE2-LABEL: constant_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $7, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlq $7, %xmm1 +; SSE41-NEXT: psrlq $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = lshr <2 x i64> %a, <i64 1, i64 7> + ret <2 x i64> %shift +} + +define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { +; SSE2-LABEL: constant_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: shrl $7, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: shrl $5, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shrl $4, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shrl $6, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: shrl $5, %eax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: shrl $4, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: shrl $6, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: shrl $7, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: shrl $5, 
%eax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: shrl $4, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: shrl $6, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> + ret <4 x i32> %shift +} + +define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { +; SSE2-LABEL: constant_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v8i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> + ret <8 x i16> %shift +} + +define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { +; SSE2-LABEL: constant_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE41-NEXT: psllw $5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <16 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { +; SSE-LABEL: splatconstant_shift_v2i64: +; SSE: # BB#0: +; SSE-NEXT: psrlq $7, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = lshr <2 x i64> %a, <i64 7, i64 7> + ret <2 x i64> %shift +} + +define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { +; SSE-LABEL: splatconstant_shift_v4i32: +; SSE: # BB#0: +; SSE-NEXT: psrld $5, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpsrld $5, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 
5, i32 5> + ret <4 x i32> %shift +} + +define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { +; SSE-LABEL: splatconstant_shift_v8i16: +; SSE: # BB#0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %shift +} + +define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { +; SSE-LABEL: splatconstant_shift_v16i8: +; SSE: # BB#0: +; SSE-NEXT: psrlw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <16 x i8> %shift +} diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll new file mode 100644 index 0000000000000..d200abd5f8755 --- /dev/null +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -0,0 +1,548 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +; +; Variable Shifts
; + +define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: var_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <4 x i64> %a, %b + ret <4 x i64> %shift +} + +define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: var_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrd $1, %xmm2, %eax +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrd $1, %xmm3, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vmovd %xmm2, %edx +; AVX1-NEXT: vmovd %xmm3, %ecx +; AVX1-NEXT: shrl %cl, %edx +; AVX1-NEXT: vmovd %edx, %xmm4 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 +; AVX1-NEXT: vpextrd $2, %xmm2, %eax +; AVX1-NEXT: vpextrd $2, %xmm3, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4 +; AVX1-NEXT: vpextrd $3, %xmm2, %eax +; AVX1-NEXT: vpextrd $3, %xmm3, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2 +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %edx +; AVX1-NEXT: vmovd %edx, %xmm3 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; 
AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: shrl %cl, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <8 x i32> %a, %b + ret <8 x i32> %shift +} + +define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: var_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4 +; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <16 x i16> %a, %b + ret <16 x i16> %shift +} + +define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: var_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <32 x i8> %a, %b + ret <32 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: splatvar_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer + %shift = lshr <4 x i64> %a, %splat + ret <4 x i64> %shift +} + +define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: splatvar_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i32> %a, %splat + ret <8 x i32> %shift +} + +define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: splatvar_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: movzwl %ax, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> 
zeroinitializer + %shift = lshr <16 x i16> %a, %splat + ret <16 x i16> %shift +} + +define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: splatvar_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer + %shift = lshr <32 x i8> %a, %splat + ret <32 x i8> %shift +} + +; +; Constant Shifts +; + +define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { +; AVX1-LABEL: constant_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> + ret <4 x i64> %shift +} + +define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { +; AVX1-LABEL: constant_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: shrl $9, %eax +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: shrl $8, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %eax +; AVX1-NEXT: shrl $8, %eax +; 
AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: shrl $5, %eax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: shrl $4, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: shrl $6, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> + ret <8 x i32> %shift +} + +define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { +; AVX1-LABEL: constant_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664] +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, 
i16 15> + ret <16 x i16> %shift +} + +define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { +; AVX1-LABEL: constant_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <32 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { +; AVX1-LABEL: splatconstant_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> + ret <4 x i64> %shift +} + +define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { +; AVX1-LABEL: splatconstant_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <8 x i32> %shift 
+} + +define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { +; AVX1-LABEL: splatconstant_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <16 x i16> %shift +} + +define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { +; AVX1-LABEL: splatconstant_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <32 x i8> %shift +} diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll new file mode 100644 index 0000000000000..3ac31ea636765 --- /dev/null +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -0,0 +1,639 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +; +; Variable Shifts +; + +define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: var_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psllq %xmm3, %xmm2 +; SSE2-NEXT: psllq %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: psllq %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = shl <2 x i64> %a, %b + ret <2 x i64> %shift +} + +define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: var_shift_v4i32: +; SSE2: #
BB#0: +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pslld $23, %xmm1 +; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = shl <4 x i32> %a, %b + ret <4 x i32> %shift +} + +define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: var_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: psllw $12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllw $12, %xmm0 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psllw $8, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $2, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $1, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: var_shift_v8i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw 
$4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v8i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %shift = shl <8 x i16> %a, %b + ret <8 x i16> %shift +} + +define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: var_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: psllw $5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: var_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $4, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $2, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: var_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = shl <16 x i8> %a, %b + ret <16 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { +; SSE-LABEL: splatvar_shift_v2i64: +; SSE: # BB#0: +; 
SSE-NEXT: psllq %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer + %shift = shl <2 x i64> %a, %splat + ret <2 x i64> %shift +} + +define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: splatvar_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: pslld %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; SSE41-NEXT: pslld %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer + %shift = shl <4 x i32> %a, %splat + ret <4 x i32> %shift +} + +define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: splatvar_shift_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; SSE41-NEXT: psllw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: splatvar_shift_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i16> %a, %splat + ret <8 x i16> %shift +} + +define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: splatvar_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: splatvar_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; 
SSE41-NEXT: psllw $4, %xmm4 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $2, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: paddb %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: splatvar_shift_v16i8: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v16i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq + %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i8> %a, %splat + ret <16 x i8> %shift +} + +; +; Constant Shifts +; + +define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { +; SSE2-LABEL: constant_shift_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllq $7, %xmm1 +; SSE2-NEXT: psllq $1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq $7, %xmm1 +; SSE41-NEXT: psllq $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = shl <2 x i64> %a, <i64 1, i64 7> + ret <2 x i64> %shift +} + +define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { +; SSE2-LABEL: constant_shift_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pmulld {{.*}}(%rip), 
%xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_shift_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq + %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> + ret <4 x i32> %shift +} + +define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { +; SSE-LABEL: constant_shift_v8i16: +; SSE: # BB#0: +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: constant_shift_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> + ret <8 x i16> %shift +} + +define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { +; SSE2-LABEL: constant_shift_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE2-NEXT: psllw $5, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; SSE41-NEXT: psllw $5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllw $4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllw $2, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: paddb %xmm2, %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm0 +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <16 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { +; SSE-LABEL: splatconstant_shift_v2i64: +; SSE: # BB#0: +; SSE-NEXT: psllq $7, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v2i64: +; 
AVX: # BB#0: +; AVX-NEXT: vpsllq $7, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = shl <2 x i64> %a, <i64 7, i64 7> + ret <2 x i64> %shift +} + +define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { +; SSE-LABEL: splatconstant_shift_v4i32: +; SSE: # BB#0: +; SSE-NEXT: pslld $5, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpslld $5, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5> + ret <4 x i32> %shift +} + +define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { +; SSE-LABEL: splatconstant_shift_v8i16: +; SSE: # BB#0: +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <8 x i16> %shift +} + +define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { +; SSE-LABEL: splatconstant_shift_v16i8: +; SSE: # BB#0: +; SSE-NEXT: psllw $3, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: splatconstant_shift_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <16 x i8> %shift +} diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll new file mode 100644 index 0000000000000..7c13c0ae4716d --- /dev/null +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -0,0 +1,459 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +; +; Variable Shifts +; + +define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: var_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <4 x i64> %a, %b + ret <4 x i64> %shift +} + +define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: var_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvd
%ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <8 x i32> %a, %b + ret <8 x i32> %shift +} + +define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: var_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsllw $8, %xmm4, %xmm5 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $2, %xmm2, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $1, %xmm2, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm4 +; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <16 x i16> %a, %b + ret <16 x i16> %shift +} + +define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: var_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 +; 
AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <32 x i8> %a, %b + ret <32 x i8> %shift +} + +; +; Uniform Variable Shifts +; + +define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: splatvar_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer + %shift = shl <4 x i64> %a, %splat + ret <4 x i64> %shift +} + +define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: splatvar_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw $3, %xmm1, %xmm2, %xmm1 # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i32> %a, %splat + ret <8 x i32> %shift +} + +define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: splatvar_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: movzwl %ax, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i16> %a, %splat + ret <16 x i16> %shift +} + +define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: splatvar_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; 
AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer + %shift = shl <32 x i8> %a, %splat + ret <32 x i8> %shift +} + +; +; Constant Shifts +; + +define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { +; AVX1-LABEL: constant_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllq $62, %xmm1, %xmm2 +; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpsllq $7, %xmm0, %xmm2 +; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> + ret <4 x i64> %shift +} + +define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { +; AVX1-LABEL: constant_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> + ret <8 x i32> %shift +} + +define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { +; AVX1-LABEL: constant_shift_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, 
i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> + ret <16 x i16> %shift +} + +define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { +; AVX1-LABEL: constant_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm4 # xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <32 x i8> %shift +} + +; +; Uniform Constant Shifts +; + +define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { +; AVX1-LABEL: splatconstant_shift_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsllq $7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> + ret <4 x i64> %shift +} + +define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { +; AVX1-LABEL: splatconstant_shift_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $5, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpslld $5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $5, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <8 x i32> %shift +} + +define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { +; AVX1-LABEL: splatconstant_shift_v16i16: +; AVX1: # BB#0: +; 
AVX1-NEXT: vpsllw $3, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <16 x i16> %shift +} + +define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { +; AVX1-LABEL: splatconstant_shift_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <32 x i8> %shift +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 53d13c86657b5..124d6e8c8ba2a 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -653,28 +653,28 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE2: # BB#0: -; SSE2-NEXT: shll $8, %edi -; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: shll $8, %edi +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pinsrw $2, %edi, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: -; SSSE3-NEXT: shll $8, %edi -; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: shll $8, %edi +; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: pinsrw $2, %edi, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pinsrb $5, %edi, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $5, %edi, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> @@ -684,28 +684,28 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { ; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSE2: # BB#0: -; SSE2-NEXT: shll $8, %edi -; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: shll $8, %edi +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pinsrw $7, %edi, %xmm0 ; SSE2-NEXT: retq ; ; 
SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSSE3: # BB#0: -; SSSE3-NEXT: shll $8, %edi -; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: shll $8, %edi +; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: pinsrw $7, %edi, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pinsrb $15, %edi, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %edi, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16> @@ -716,27 +716,27 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE2: # BB#0: ; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pinsrw $1, %eax, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: ; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: pinsrw $1, %eax, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pinsrb $2, %edi, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $2, %edi, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -1341,12 +1341,12 @@ define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz( define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_bitcast_unpack: ; SSE: # BB#0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_bitcast_unpack: ; AVX: # BB#0: -; AVX-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: retq %shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16> %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float> diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 4007f0b2b13bf..6a29d33d6c5e7 100644 --- 
a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1384,14 +1384,14 @@ define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) { define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_z8zzzzzz: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: pinsrw $1, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_z8zzzzzz: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $1, %edi, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3> @@ -1401,14 +1401,14 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zzzzz8zz: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: pinsrw $5, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_zzzzz8zz: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, %edi, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0> @@ -1418,14 +1418,14 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zuuzuuz8: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: pinsrw $7, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_zuuzuuz8: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, %edi, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8> @@ -1435,14 +1435,14 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zzBzzzzz: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: pinsrw $2, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_zzBzzzzz: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, %edi, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 3 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7> diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 944ec4b8d3ac7..62bf288a870d1 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -810,30 +810,20 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { } define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) { -; AVX1-LABEL: insert_reg_and_zero_v4i64: -; AVX1: # BB#0: -; AVX1-NEXT: vmovq %rdi, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_reg_and_zero_v4i64: -; AVX2: # BB#0: -; AVX2-NEXT: vmovq %rdi, %xmm0 -; AVX2-NEXT: retq +; ALL-LABEL: insert_reg_and_zero_v4i64: +; ALL: # 
BB#0: +; ALL-NEXT: vmovq %rdi, %xmm0 +; ALL-NEXT: retq %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x i64> %shuffle } define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) { -; AVX1-LABEL: insert_mem_and_zero_v4i64: -; AVX1: # BB#0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_mem_and_zero_v4i64: -; AVX2: # BB#0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: retq +; ALL-LABEL: insert_mem_and_zero_v4i64: +; ALL: # BB#0: +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> @@ -874,15 +864,10 @@ define <4 x double> @splat_mem_v4f64(double* %ptr) { } define <4 x i64> @splat_mem_v4i64(i64* %ptr) { -; AVX1-LABEL: splat_mem_v4i64: -; AVX1: # BB#0: -; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splat_mem_v4i64: -; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: splat_mem_v4i64: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 +; ALL-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> @@ -915,6 +900,60 @@ define <4 x double> @splat_v4f64(<2 x double> %r) { ret <4 x double> %1 } +define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) { +; AVX1-LABEL: splat_mem_v4i64_from_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4i64_from_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX2-NEXT: retq + %v = load <2 x i64>, <2 x i64>* %ptr + %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + ret <4 x i64> %shuffle +} + +define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) { +; AVX1-LABEL: splat_mem_v4f64_from_v2f64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4f64_from_v2f64: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX2-NEXT: retq + %v = load <2 x double>, <2 x double>* %ptr + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) { +; ALL-LABEL: splat128_mem_v4i64_from_v2i64: +; ALL: # BB#0: +; ALL-NEXT: vmovaps (%rdi), %xmm0 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %v = load <2 x i64>, <2 x i64>* %ptr + %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> + ret <4 x i64> %shuffle +} + +define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) { +; ALL-LABEL: splat128_mem_v4f64_from_v2f64: +; ALL: # BB#0: +; ALL-NEXT: vmovaps (%rdi), %xmm0 +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq + %v = load <2 x double>, <2 x double>* %ptr + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> + ret <4 x double> %shuffle +} + define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: 
bitcast_v4f64_0426: ; AVX1: # BB#0: @@ -923,7 +962,7 @@ define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) { ; ; AVX2-LABEL: bitcast_v4f64_0426: ; AVX2: # BB#0: -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: retq %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2> %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float> diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index bb07077b5559c..bc72e0a661777 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -2088,15 +2088,10 @@ entry: } define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) { -; AVX1-LABEL: insert_mem_and_zero_v8i32: -; AVX1: # BB#0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_mem_and_zero_v8i32: -; AVX2: # BB#0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: retq +; ALL-LABEL: insert_mem_and_zero_v8i32: +; ALL: # BB#0: +; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: retq %a = load i32, i32* %ptr %v = insertelement <8 x i32> undef, i32 %a, i32 0 %shuffle = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 2c6c8a3e7ade3..62d4af7809b6b 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -15,8 +15,9 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00000010: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> ret <8 x double> %shuffle @@ -25,8 +26,9 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00000200: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> ret <8 x double> %shuffle @@ -35,8 +37,9 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00003000: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> ret <8 x double> 
@@ -45,8 +48,11 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00040000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -55,8 +61,11 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00500000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -65,8 +74,11 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_06000000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -75,11 +87,11 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_70000000:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: movl $7, %eax
-; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -88,7 +100,10 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01014545:
; ALL: # BB#0:
-; ALL-NEXT: vshuff64x2 $160, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -97,8 +112,9 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00112233:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x double> %shuffle
@@ -107,8 +123,9 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00001111:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x double> %shuffle
@@ -117,7 +134,11 @@ define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_81a3c5e7:
; ALL: # BB#0:
-; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x double> %shuffle
@@ -126,9 +147,10 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08080808:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x double> %shuffle
@@ -137,9 +159,15 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08084c4c:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x double> %shuffle
@@ -148,9 +176,13 @@ define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_8823cc67:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -159,9 +191,13 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_9832dc76:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -170,9 +206,13 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_9810dc54:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -181,9 +221,15 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08194c5d:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x double> %shuffle
@@ -192,9 +238,15 @@ define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_2a3b6e7f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x double> %shuffle
@@ -203,9 +255,13 @@ define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08192a3b:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x double> %shuffle
@@ -214,9 +270,11 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08991abb:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x double> %shuffle
@@ -225,9 +283,12 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_091b2d3f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x double> %shuffle
@@ -236,9 +297,11 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_09ab1def:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x double> %shuffle
@@ -247,7 +310,10 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00014445:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $64, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -256,7 +322,10 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00204464:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $32, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x double> %shuffle
@@ -265,7 +334,10 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_03004744:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $12, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -274,7 +346,10 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10005444:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $1, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -283,7 +358,10 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_22006644:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $10, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -292,7 +370,10 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_33307774:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $63, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -301,7 +382,10 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_32107654:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $27, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -310,7 +394,10 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00234467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -319,7 +406,10 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00224466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -328,7 +418,10 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10325476:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -337,7 +430,10 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_11335577:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x double> %shuffle
@@ -346,7 +442,10 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -355,7 +454,10 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10225466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -364,8 +466,10 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00015444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -374,8 +478,10 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00204644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -384,8 +490,10 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_03004474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -394,8 +502,10 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10004444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -404,8 +514,10 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_22006446:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x double> %shuffle
@@ -414,8 +526,10 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_33307474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -424,8 +538,9 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_32104567:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -434,8 +549,10 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00236744:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -444,8 +561,10 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00226644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -454,7 +573,9 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10324567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -463,7 +584,9 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_11334567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -472,7 +595,9 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -481,7 +606,9 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01235466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -490,8 +617,10 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_002u6u44:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -500,8 +629,10 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00uu66uu:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -510,7 +641,9 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_103245uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -519,7 +652,9 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_1133uu67:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -528,7 +663,9 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_0uu354uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -537,7 +674,9 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_uuu3uu66:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -546,9 +685,16 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_c348cda0:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm4
+; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
ret <8 x double> %shuffle
@@ -557,9 +703,17 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_f511235a:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,3,1,3]
+; ALL-NEXT: vmovddup {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
ret <8 x double> %shuffle
@@ -577,8 +731,9 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000010:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x i64> %shuffle
@@ -587,8 +742,9 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000200:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -597,8 +753,9 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00003000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -607,8 +764,11 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00040000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -617,8 +777,11 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00500000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -627,8 +790,11 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_06000000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -637,11 +803,11 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_70000000:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: movl $7, %eax
-; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -650,7 +816,10 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01014545:
; ALL: # BB#0:
-; ALL-NEXT: vshufi64x2 $160, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -659,8 +828,9 @@ define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00112233:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i64> %shuffle
@@ -669,8 +839,9 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00001111:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i64> %shuffle
@@ -679,7 +850,11 @@ define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_81a3c5e7:
; ALL: # BB#0:
-; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x i64> %shuffle
@@ -688,9 +863,10 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08080808:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x i64> %shuffle
@@ -699,9 +875,15 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08084c4c:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x i64> %shuffle
@@ -710,9 +892,13 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_8823cc67:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -721,9 +907,13 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_9832dc76:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -732,9 +922,13 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_9810dc54:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -743,9 +937,15 @@ define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08194c5d:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i64> %shuffle
@@ -754,9 +954,15 @@ define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_2a3b6e7f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i64> %shuffle
@@ -765,9 +971,13 @@ define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08192a3b:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i64> %shuffle
@@ -776,9 +986,11 @@ define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08991abb:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x i64> %shuffle
define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10005444:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $1, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -845,7 +1074,10 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_22006644:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $10, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -854,7 +1086,10 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_33307774:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $63, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -863,7 +1098,10 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_32107654:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $27, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -872,7 +1110,10 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00234467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -881,7 +1122,10 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00224466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -890,7 +1134,10 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10325476:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -899,7 +1146,10 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_11335577:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i64> %shuffle
@@ -908,7 +1158,10 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -917,7 +1170,10 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10225466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -926,8 +1182,10 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00015444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -936,8 +1194,10 @@ define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00204644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -946,8 +1206,10 @@ define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_03004474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -956,8 +1218,10 @@ define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10004444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -966,8 +1230,10 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_22006446:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x i64> %shuffle
@@ -976,8 +1242,10 @@ define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_33307474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -986,8 +1254,9 @@ define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_32104567:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -996,8 +1265,10 @@ define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00236744:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1006,8 +1277,10 @@ define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00226644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1016,7 +1289,9 @@ define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10324567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1025,7 +1300,9 @@ define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_11334567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1034,7 +1311,9 @@ define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1043,7 +1322,9 @@ define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01235466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1052,8 +1333,10 @@ define <8 x i64>
@shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_002u6u44: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1062,8 +1345,10 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00uu66uu: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 -; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> ret <8 x i64> %shuffle @@ -1072,7 +1357,9 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_103245uu: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> ret <8 x i64> %shuffle @@ -1081,7 +1368,9 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_1133uu67: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1090,7 +1379,9 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_0uu354uu: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> ret <8 x i64> %shuffle @@ -1099,7 +1390,9 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_uuu3uu66: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 
undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> ret <8 x i64> %shuffle @@ -1108,9 +1401,15 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_6caa87e5: ; ALL: # BB#0: -; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 -; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5> ret <8 x i64> %shuffle diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll index a2f3d7b82b369..0a6eea049d372 100644 --- a/test/CodeGen/X86/widen_conv-3.ll +++ b/test/CodeGen/X86/widen_conv-3.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -; CHECK: cvtsi2ss +; CHECK: cvtdq2ps ; sign to float v2i16 to v2f32 diff --git a/test/CodeGen/X86/win64_params.ll b/test/CodeGen/X86/win64_params.ll index 9718c86300c25..a0b552d4d5847 100644 --- a/test/CodeGen/X86/win64_params.ll +++ b/test/CodeGen/X86/win64_params.ll @@ -7,8 +7,7 @@ define i32 @f6(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind re entry: ; CHECK: movl 48(%rsp), %eax ; CHECK: addl 40(%rsp), %eax -; LINUX: addl %r9d, %r8d -; LINUX: movl %r8d, %eax +; LINUX: leal (%r8,%r9), %eax %add = add nsw i32 %p6, %p5 ret i32 %add } @@ -27,10 +26,8 @@ entry: ; on other platforms here (note the x86_64_sysvcc calling convention). 
define x86_64_sysvcc i32 @f8(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { entry: -; CHECK: addl %r9d, %r8d -; CHECK: movl %r8d, %eax -; LINUX: addl %r9d, %r8d -; LINUX: movl %r8d, %eax +; CHECK: leal (%r8,%r9), %eax +; LINUX: leal (%r8,%r9), %eax %add = add nsw i32 %p6, %p5 ret i32 %add } diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll index 199557dac2061..77c37b4d348e2 100644 --- a/test/CodeGen/X86/win_cst_pool.ll +++ b/test/CodeGen/X86/win_cst_pool.ll @@ -64,3 +64,16 @@ define <4 x float> @undef1() { ; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0 ; CHECK-NEXT: ret } + +define float @pr23966(i32 %a) { + %tobool = icmp ne i32 %a, 0 + %sel = select i1 %tobool, float -1.000000e+00, float 1.000000e+00 + ret float %sel +} + +; CHECK: .globl __real@bf8000003f800000 +; CHECK-NEXT: .section .rdata,"dr",discard,__real@bf8000003f800000 +; CHECK-NEXT: .align 4 +; CHECK-NEXT: __real@bf8000003f800000: +; CHECK-NEXT: .long 1065353216 +; CHECK-NEXT: .long 3212836864 diff --git a/test/CodeGen/X86/win_ftol2.ll b/test/CodeGen/X86/win_ftol2.ll index 14591248f354e..dfa6e3aa76bdd 100644 --- a/test/CodeGen/X86/win_ftol2.ll +++ b/test/CodeGen/X86/win_ftol2.ll @@ -142,3 +142,25 @@ define i64 @double_ui64_5(double %X) { %tmp.1 = fptoui double %X to i64 ret i64 %tmp.1 } + +define double @pr23957_32(double %A) { +; FTOL-LABEL: @pr23957_32 +; FTOL: fldl +; FTOL-NEXT: fld %st(0) +; FTOL-NEXT: calll __ftol2 + %B = fptoui double %A to i32 + %C = uitofp i32 %B to double + %D = fsub double %C, %A + ret double %D +} + +define double @pr23957_64(double %A) { +; FTOL-LABEL: @pr23957_64 +; FTOL: fldl +; FTOL-NEXT: fld %st(0) +; FTOL-NEXT: calll __ftol2 + %B = fptoui double %A to i64 + %C = uitofp i64 %B to double + %D = fsub double %C, %A + ret double %D +} diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll index 829be41e51279..f78fe27578651 100644 --- a/test/CodeGen/X86/xor.ll +++ b/test/CodeGen/X86/xor.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s -check-prefix=X32 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 | FileCheck %s -check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 | FileCheck %s -check-prefix=X64 @@ -193,3 +193,22 @@ define i32 @test11(i32 %b) { ; X32: movl $-2, %[[REG:.*]] ; X32: roll %{{.*}}, %[[REG]] } + +%struct.ref_s = type { %union.v, i16, i16 } +%union.v = type { i64 } + +define %struct.ref_s* @test12(%struct.ref_s* %op, i64 %osbot, i64 %intval) { + %neg = shl i64 %intval, 32 + %sext = xor i64 %neg, -4294967296 + %idx.ext = ashr exact i64 %sext, 32 + %add.ptr = getelementptr inbounds %struct.ref_s, %struct.ref_s* %op, i64 %idx.ext + ret %struct.ref_s* %add.ptr +; X64-LABEL: test12: +; X64: shlq $32, %[[REG:.*]] +; X64-NOT: not +; X64: sarq $28, %[[REG]] +; X32-LABEL: test12: +; X32: leal +; X32-NOT: not +; X32: shll $2, %eax +} |
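A note on the vector-shuffle-512-v8.ll checks above: the removed lines used the raw immediate forms (e.g. vpermq $27, %zmm0, %zmm0), while the new lines spell the expected pattern out in {{.*#+}} assembly comments. Both notations encode the same permutation. As a rough sketch of the correspondence (helper names here are illustrative, not LLVM's): the VPERMQ immediate packs four 2-bit qword selectors LSB-first, and the vpshufd comments list the same in-lane pattern as eight dword indices.

import itertools  # only used implicitly by the list comprehension below

def vpermq_imm(lanes):
    """Pack four 2-bit qword selectors, LSB first, into a VPERMQ imm8."""
    imm = 0
    for i, lane in enumerate(lanes):
        imm |= (lane & 0b11) << (2 * i)
    return imm

# The old immediates match the index comments on the new per-ymm checks:
assert vpermq_imm([1, 0, 0, 0]) == 1    # vpermq $1  <-> ymm0[1,0,0,0]
assert vpermq_imm([2, 2, 0, 0]) == 10   # vpermq $10 <-> ymm0[2,2,0,0]
assert vpermq_imm([3, 3, 3, 0]) == 63   # vpermq $63 <-> ymm0[3,3,3,0]
assert vpermq_imm([3, 2, 1, 0]) == 27   # vpermq $27 <-> ymm0[3,2,1,0]

def qwords_as_dwords(lanes):
    """Rewrite a qword shuffle as the dword indices a vpshufd comment prints."""
    return [d for q in lanes for d in (2 * q, 2 * q + 1)]

# Why e.g. shuffle_v8i64_00224466 can use vpshufd: the per-ymm qword pattern
# [0,0,2,2] keeps each qword inside its 128-bit lane, so it is expressible
# as the dword shuffle [0,1,0,1,4,5,4,5].
assert qwords_as_dwords([0, 0, 2, 2]) == [0, 1, 0, 1, 4, 5, 4, 5]
assert qwords_as_dwords([1, 0, 3, 2]) == [2, 3, 0, 1, 6, 7, 4, 5]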
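The new win_cst_pool.ll case pins down the COMDAT constant-pool entry emitted for selecting between -1.0f and 1.0f. The expected .long values and the mangled __real@... label can be cross-checked from the IEEE-754 single-precision bit patterns; a minimal sketch, assuming only Python's standard struct module:

import struct

def float_bits(x):
    """Bit pattern of x as an IEEE-754 single, read back as an unsigned int."""
    return struct.unpack('<I', struct.pack('<f', x))[0]

one, neg_one = float_bits(1.0), float_bits(-1.0)
assert one == 1065353216        # 0x3f800000, the first  .long in the entry
assert neg_one == 3212836864    # 0xbf800000, the second .long in the entry
# The pool label spells the combined 8-byte constant high dword first:
assert '__real@%08x%08x' % (neg_one, one) == '__real@bf8000003f800000'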
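In the xor.ll test12 added above, the X64 checks require that no `not` instruction appears between the shift pair, and that the sign-extension (`ashr exact` by 32) folds with the 16-byte %struct.ref_s element scale of the GEP into a single sarq $28. A quick arithmetic sketch of why those folds are sound, using Python's arbitrary-precision ints as a stand-in for i64 two's-complement values:

mask = -4294967296
assert mask == -(1 << 32)   # all-ones high dword, zero low dword

x = 7                       # stand-in for the low 32 bits of %intval
# xor'ing the shifted value with the mask complements the high dword:
assert ((x << 32) ^ mask) >> 32 == ~x
# folding the sign-extend (>> 32) with the 16-byte element scale (<< 4)
# yields the single arithmetic right shift by 28 that the test expects:
assert ((x << 32) ^ mask) >> 28 == ~x * 16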