Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/StackColoring.ll                 |   33
-rw-r--r--  test/CodeGen/X86/asm-mismatched-types.ll          |  135
-rw-r--r--  test/CodeGen/X86/asm-reject-reg-type-mismatch.ll  |    6
-rw-r--r--  test/CodeGen/X86/avx512-build-vector.ll           |   10
-rw-r--r--  test/CodeGen/X86/avx512-fma-intrinsics.ll         |  579
-rw-r--r--  test/CodeGen/X86/avx512-fma.ll                    |  122
-rw-r--r--  test/CodeGen/X86/avx512-gather-scatter-intrin.ll  |  748
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics.ll             |  177
-rw-r--r--  test/CodeGen/X86/avx512-shuffle.ll                |  392
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics.ll           |   79
-rw-r--r--  test/CodeGen/X86/avx512bwvl-intrinsics.ll         | 1103
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics.ll           |  209
-rw-r--r--  test/CodeGen/X86/coff-weak.ll                     |    9
-rw-r--r--  test/CodeGen/X86/commute-two-addr.ll              |    2
-rw-r--r--  test/CodeGen/X86/dllexport-x86_64.ll              |   60
-rw-r--r--  test/CodeGen/X86/dllexport.ll                     |   71
-rw-r--r--  test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll |  204
-rw-r--r--  test/CodeGen/X86/fma-intrinsics-x86.ll            |  493
-rw-r--r--  test/CodeGen/X86/fma-intrinsics-x86_64.ll         |  278
-rw-r--r--  test/CodeGen/X86/fma-phi-213-to-231.ll            |  283
-rw-r--r--  test/CodeGen/X86/fma.ll                           |   83
-rwxr-xr-x  test/CodeGen/X86/fma3-intrinsics.ll               |  150
-rw-r--r--  test/CodeGen/X86/fold-load-binops.ll              |  142
-rw-r--r--  test/CodeGen/X86/fold-vector-sext-crash2.ll       |   92
-rw-r--r--  test/CodeGen/X86/fold-vector-shl-crash.ll         |    9
-rw-r--r--  test/CodeGen/X86/fp-fast.ll                       |   78
-rw-r--r--  test/CodeGen/X86/implicit-null-check-negative.ll  |   11
-rw-r--r--  test/CodeGen/X86/implicit-null-check.ll           |   28
-rw-r--r--  test/CodeGen/X86/machine-combiner.ll              |   99
-rw-r--r--  test/CodeGen/X86/movtopush.ll                     |   93
-rw-r--r--  test/CodeGen/X86/or-branch.ll                     |   31
-rw-r--r--  test/CodeGen/X86/pr23900.ll                       |   29
-rw-r--r--  test/CodeGen/X86/recip-fastmath.ll                |   38
-rw-r--r--  test/CodeGen/X86/rrlist-livereg-corrutpion.ll     |   26
-rw-r--r--  test/CodeGen/X86/sdiv-exact.ll                    |   13
-rw-r--r--  test/CodeGen/X86/seh-catch-all-win32.ll           |   29
-rw-r--r--  test/CodeGen/X86/seh-filter-no-personality.ll     |   33
-rw-r--r--  test/CodeGen/X86/seh-safe-div-win32.ll            |   46
-rw-r--r--  test/CodeGen/X86/shift-combine.ll                 |   59
-rw-r--r--  test/CodeGen/X86/sqrt-fastmath.ll                 |   70
-rw-r--r--  test/CodeGen/X86/stack-folding-fp-sse42.ll        |    8
-rw-r--r--  test/CodeGen/X86/stack-folding-int-avx2.ll        |   16
-rw-r--r--  test/CodeGen/X86/statepoint-stackmap-format.ll    |    5
-rw-r--r--  test/CodeGen/X86/system-intrinsics-64.ll          |   33
-rw-r--r--  test/CodeGen/X86/system-intrinsics.ll             |   17
-rw-r--r--  test/CodeGen/X86/twoaddr-lea.ll                   |    3
-rw-r--r--  test/CodeGen/X86/vec_int_to_fp.ll                 |   50
-rw-r--r--  test/CodeGen/X86/vec_shift8.ll                    |  527
-rw-r--r--  test/CodeGen/X86/vector-sext.ll                   |   40
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-128.ll         | 1041
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-256.ll         |  767
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-128.ll         |  778
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-256.ll         |  548
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-128.ll          |  639
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-256.ll          |  459
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v16.ll        |   42
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v8.ll         |   24
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v4.ll         |   95
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v8.ll         |   13
-rw-r--r--  test/CodeGen/X86/vector-shuffle-512-v8.ll         |  713
-rw-r--r--  test/CodeGen/X86/widen_conv-3.ll                  |    2
-rw-r--r--  test/CodeGen/X86/win64_params.ll                  |    9
-rw-r--r--  test/CodeGen/X86/win_cst_pool.ll                  |   13
-rw-r--r--  test/CodeGen/X86/win_ftol2.ll                     |   22
-rw-r--r--  test/CodeGen/X86/xor.ll                           |   21
65 files changed, 9322 insertions(+), 2715 deletions(-)
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index 414ccf469ebbd..634f66ad52dea 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -1,9 +1,10 @@
-; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
-; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR --check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR --check-prefix=CHECK
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: myCall_w2:
;YESCOLOR: subq $144, %rsp
;NOCOLOR: subq $272, %rsp
@@ -28,6 +29,7 @@ entry:
}
+;CHECK-LABEL: myCall2_no_merge
;YESCOLOR: subq $272, %rsp
;NOCOLOR: subq $272, %rsp
@@ -56,6 +58,7 @@ bb3:
ret i32 0
}
+;CHECK-LABEL: myCall2_w2
;YESCOLOR: subq $144, %rsp
;NOCOLOR: subq $272, %rsp
@@ -82,12 +85,11 @@ bb2:
bb3:
ret i32 0
}
+
+;CHECK-LABEL: myCall_w4:
;YESCOLOR: subq $200, %rsp
;NOCOLOR: subq $408, %rsp
-
-
-
define i32 @myCall_w4(i32 %in) {
entry:
%a1 = alloca [14 x i8*], align 8
@@ -119,6 +121,7 @@ entry:
ret i32 %t7
}
+;CHECK-LABEL: myCall2_w4:
;YESCOLOR: subq $112, %rsp
;NOCOLOR: subq $400, %rsp
@@ -158,6 +161,7 @@ bb3:
}
+;CHECK-LABEL: myCall2_noend:
;YESCOLOR: subq $144, %rsp
;NOCOLOR: subq $272, %rsp
@@ -185,6 +189,7 @@ bb3:
ret i32 0
}
+;CHECK-LABEL: myCall2_noend2:
;YESCOLOR: subq $144, %rsp
;NOCOLOR: subq $272, %rsp
define i32 @myCall2_noend2(i32 %in, i1 %d) {
@@ -211,6 +216,7 @@ bb3:
}
+;CHECK-LABEL: myCall2_nostart:
;YESCOLOR: subq $144, %rsp
;NOCOLOR: subq $272, %rsp
define i32 @myCall2_nostart(i32 %in, i1 %d) {
@@ -236,6 +242,7 @@ bb3:
}
; Adopt the test from Transforms/Inline/array_merge.ll'
+;CHECK-LABEL: array_merge:
;YESCOLOR: subq $816, %rsp
;NOCOLOR: subq $1616, %rsp
define void @array_merge() nounwind ssp {
@@ -261,6 +268,7 @@ entry:
ret void
}
+;CHECK-LABEL: func_phi_lifetime:
;YESCOLOR: subq $272, %rsp
;NOCOLOR: subq $272, %rsp
define i32 @func_phi_lifetime(i32 %in, i1 %d) {
@@ -297,8 +305,7 @@ bb3:
}
-;YESCOLOR-LABEL: multi_region_bb:
-;NOCOLOR-LABEL: multi_region_bb:
+;CHECK-LABEL: multi_region_bb:
define void @multi_region_bb() nounwind ssp {
entry:
%A.i1 = alloca [100 x i32], align 4
@@ -323,10 +330,9 @@ entry:
call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
ret void
}
-
-
;YESCOLOR: subq $272, %rsp
;NOCOLOR: subq $272, %rsp
+
define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
entry:
%a = alloca [17 x i8*], align 8
@@ -353,9 +359,8 @@ bb3:
; Regression test for PR15707. %buf1 and %buf2 should not be merged
; in this test case.
-;YESCOLOR-LABEL: myCall_pr15707:
+;CHECK-LABEL: myCall_pr15707:
;YESCOLOR: subq $200008, %rsp
-;NOCOLOR-LABEL: myCall_pr15707:
;NOCOLOR: subq $200008, %rsp
define void @myCall_pr15707() {
%buf1 = alloca i8, i32 100000, align 16
@@ -374,8 +379,7 @@ define void @myCall_pr15707() {
; Check that we don't assert and crash even when there are allocas
; outside the declared lifetime regions.
-;YESCOLOR-LABEL: bad_range:
-;NOCOLOR-LABEL: bad_range:
+;CHECK-LABEL: bad_range:
define void @bad_range() nounwind ssp {
entry:
%A.i1 = alloca [100 x i32], align 4
@@ -400,8 +404,7 @@ block2:
; Check that we don't assert and crash even when there are usages
; of allocas which do not read or write outside the declared lifetime regions.
-;YESCOLOR-LABEL: shady_range:
-;NOCOLOR-LABEL: shady_range:
+;CHECK-LABEL: shady_range:
%struct.Klass = type { i32, i32 }
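The StackColoring.ll change above deduplicates the YESCOLOR-LABEL/NOCOLOR-LABEL pairs: both RUN lines now also pass --check-prefix=CHECK, so a single CHECK-LABEL line anchors the prefix-specific checks in both configurations. A minimal sketch of the pattern, assuming a hypothetical function @f and prefixes FAST/SLOW:

; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s --check-prefix=FAST --check-prefix=CHECK
; RUN: llc -mtriple=x86_64-- -O0 < %s | FileCheck %s --check-prefix=SLOW --check-prefix=CHECK

; The label below is verified in both runs; FAST/SLOW lines only in their own run.
; CHECK-LABEL: f:
; FAST: retq
; SLOW: retq
define void @f() {
  ret void
}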
diff --git a/test/CodeGen/X86/asm-mismatched-types.ll b/test/CodeGen/X86/asm-mismatched-types.ll
new file mode 100644
index 0000000000000..97f9c0872f8f5
--- /dev/null
+++ b/test/CodeGen/X86/asm-mismatched-types.ll
@@ -0,0 +1,135 @@
+; RUN: llc -o - %s -no-integrated-as | FileCheck %s
+target triple = "x86_64--"
+
+; Allow any of the 8/16/32/64-bit register names to be specified
+; interchangeably in constraints
+
+; Produced by C programs like this:
+; void foo(int p) { register int reg __asm__("r8") = p;
+; __asm__ __volatile__("# REG: %0" : : "r" (reg)); }
+
+; CHECK-LABEL: reg64_as_32:
+; CHECK: # REG: %r8d
+define void @reg64_as_32(i32 %p) {
+ call void asm sideeffect "# REG: $0", "{r8}"(i32 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg64_as_32_float:
+; CHECK: # REG: %r8d
+define void @reg64_as_32_float(float %p) {
+ call void asm sideeffect "# REG: $0", "{r8}"(float %p)
+ ret void
+}
+
+; CHECK-LABEL: reg64_as_16:
+; CHECK: # REG: %r9w
+define void @reg64_as_16(i16 %p) {
+ call void asm sideeffect "# REG: $0", "{r9}"(i16 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg64_as_8:
+; CHECK: # REG: %bpl
+define void @reg64_as_8(i8 %p) {
+ call void asm sideeffect "# REG: $0", "{rbp}"(i8 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg32_as_16:
+; CHECK: # REG: %r15w
+define void @reg32_as_16(i16 %p) {
+ call void asm sideeffect "# REG: $0", "{r15d}"(i16 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg32_as_8:
+; CHECK: # REG: %r12b
+define void @reg32_as_8(i8 %p) {
+ call void asm sideeffect "# REG: $0", "{r12d}"(i8 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg16_as_8:
+; CHECK: # REG: %cl
+define void @reg16_as_8(i8 %p) {
+ call void asm sideeffect "# REG: $0", "{cx}"(i8 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg32_as_64:
+; CHECK: # REG: %rbp
+define void @reg32_as_64(i64 %p) {
+ call void asm sideeffect "# REG: $0", "{ebp}"(i64 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg32_as_64_float:
+; CHECK: # REG: %rbp
+define void @reg32_as_64_float(double %p) {
+ call void asm sideeffect "# REG: $0", "{ebp}"(double %p)
+ ret void
+}
+
+; CHECK-LABEL: reg16_as_64:
+; CHECK: # REG: %r13
+define void @reg16_as_64(i64 %p) {
+ call void asm sideeffect "# REG: $0", "{r13w}"(i64 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg16_as_64_float:
+; CHECK: # REG: %r13
+define void @reg16_as_64_float(double %p) {
+ call void asm sideeffect "# REG: $0", "{r13w}"(double %p)
+ ret void
+}
+
+; CHECK-LABEL: reg8_as_64:
+; CHECK: # REG: %rax
+define void @reg8_as_64(i64 %p) {
+ call void asm sideeffect "# REG: $0", "{al}"(i64 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg8_as_64_float:
+; CHECK: # REG: %rax
+define void @reg8_as_64_float(double %p) {
+ call void asm sideeffect "# REG: $0", "{al}"(double %p)
+ ret void
+}
+
+; CHECK-LABEL: reg16_as_32:
+; CHECK: # REG: %r11d
+define void @reg16_as_32(i32 %p) {
+ call void asm sideeffect "# REG: $0", "{r11w}"(i32 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg16_as_32_float:
+; CHECK: # REG: %r11d
+define void @reg16_as_32_float(float %p) {
+ call void asm sideeffect "# REG: $0", "{r11w}"(float %p)
+ ret void
+}
+
+; CHECK-LABEL: reg8_as_32:
+; CHECK: # REG: %r9d
+define void @reg8_as_32(i32 %p) {
+ call void asm sideeffect "# REG: $0", "{r9b}"(i32 %p)
+ ret void
+}
+
+; CHECK-LABEL: reg8_as_32_float:
+; CHECK: # REG: %r9d
+define void @reg8_as_32_float(float %p) {
+ call void asm sideeffect "# REG: $0", "{r9b}"(float %p)
+ ret void
+}
+
+; CHECK-LABEL: reg8_as_16:
+; CHECK: # REG: %di
+define void @reg8_as_16(i16 %p) {
+ call void asm sideeffect "# REG: $0", "{dil}"(i16 %p)
+ ret void
+}
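Each new test above pins a single value to one register name; the constraints are resolved per operand, so widths can also be mixed within one asm statement. A minimal sketch in the same style, built from constraints used above (the function name is hypothetical):

; CHECK-LABEL: mixed:
; CHECK: # REGS: %r8d %r9w
define void @mixed(i32 %a, i16 %b) {
  ; i32 in {r8} prints the 32-bit subregister, i16 in {r9} the 16-bit one.
  call void asm sideeffect "# REGS: $0 $1", "{r8},{r9}"(i32 %a, i16 %b)
  ret void
}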
diff --git a/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll b/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll
index 016e2d261eef6..c7e86f565eefa 100644
--- a/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll
+++ b/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll
@@ -1,10 +1,8 @@
-; RUN: not llc -no-integrated-as %s -o - 2> %t1
-; RUN: FileCheck %s < %t1
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+; RUN: not llc -o /dev/null %s 2>&1 | FileCheck %s
target triple = "x86_64--"
; CHECK: error: couldn't allocate output register for constraint '{ax}'
define i128 @blup() {
- %v = tail call i128 asm "", "={ax},0,~{dirflag},~{fpsr},~{flags}"(i128 0)
+ %v = tail call i128 asm "", "={ax},0"(i128 0)
ret i128 %v
}
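For contrast with the rejected i128 above: a value that actually fits the named register is accepted, so the diagnostic is specific to the type/register-size mismatch. A minimal sketch of the accepted form (hypothetical function name):

define i64 @fits() {
  ; An i64 fits the single 64-bit register named by {ax}, so %rax is allocated.
  %v = tail call i64 asm "", "={ax},0"(i64 0)
  ret i64 %v
}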
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index e70d9f3ad521c..e5373c575c1ad 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -1,15 +1,5 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-define <16 x i32> @test1(i32* %x) {
-; CHECK-LABEL: test1:
-; CHECK: vmovd (%rdi), %xmm
-; CHECK: vmovdqa32
-; CHECK: vpermt2d %zmm
- %y = load i32, i32* %x, align 4
- %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
- ret <16 x i32>%res
-}
-
define <16 x i32> @test2(<16 x i32> %x) {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
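The avx512-fma-intrinsics.ll diff that follows renames the intrinsics from the llvm.x86.fma.mask.* namespace to llvm.x86.avx512.mask.* (adding mask3/maskz variants) and exercises the trailing i32 rounding-mode operand; per the CHECK lines below, 0 selects {rn-sae}, 1 {rd-sae}, 2 {ru-sae}, 3 {rz-sae}, and 4 the current MXCSR rounding. A minimal sketch of one call with static round-to-nearest (the function name is hypothetical; the declaration is taken from the diff):

declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)

; i16 -1 means no masking; i32 0 requests round-to-nearest-even with
; suppress-all-exceptions, lowering to vfmadd213ps {rn-sae} regardless of MXCSR.
define <16 x float> @fmadd_rne(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
  %r = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 0)
  ret <16 x float> %r
}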
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index 9814a6108272a..c30fc909f09b5 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,422 +1,675 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s
-declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
-declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-
-define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfmsubpd_z
- ; CHECK: vfmsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <8 x double> @test_mask_vfmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsub_pd
- ; CHECK: vfmsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
- ret <8 x double> %res
-}
+declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfnmadd_ps_z
; CHECK: vfnmadd213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_vfnmadd_ps
; CHECK: vfnmadd213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfnmadd_pd_z
; CHECK: vfnmadd213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd_pd
; CHECK: vfnmadd213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfnmsubps_z
; CHECK: vfnmsub213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_vfnmsub_ps
; CHECK: vfnmsub213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfnmsubpd_z
; CHECK: vfnmsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub_pd
; CHECK: vfnmsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfmaddsubps_z
; CHECK: vfmaddsub213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
; CHECK-LABEL: test_mask_fmaddsub_ps:
; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfmaddsubpd_z
; CHECK: vfmaddsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmaddsub_pd
; CHECK: vfmaddsub213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
- ret <8 x double> %res
-}
-
-define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_x86_vfmsubaddps_z
- ; CHECK: vfmsubadd213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_mask_vfmsubadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubadd_ps
- ; CHECK: vfmsubadd213ps %zmm
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
- ret <16 x float> %res
-}
-
-define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfmsubaddpd_z
- ; CHECK: vfmsubadd213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
-define <8 x double> @test_mask_vfmsubadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubadd_pd
- ; CHECK: vfmsubadd213pd %zmm
- %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
- ret <8 x double> %res
+define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne
; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn
; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp
; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz
; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current
; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne
; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn
; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp
; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz
; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current
; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rne
- ; CHECK: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtn
- ; CHECK: vfmsub213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtp
- ; CHECK: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtz
- ; CHECK: vfmsub213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_current
- ; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rne
- ; CHECK: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtn
- ; CHECK: vfmsub213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtp
- ; CHECK: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
- ret <16 x float> %res
-}
-
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtz
- ; CHECK: vfmsub213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
-define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_current
- ; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xaa,0xc2]
- %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
- ret <16 x float> %res
+declare <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne
; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn
; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp
; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz
; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current
; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne
; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn
; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp
; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz
; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current
; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
+define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne
; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn
; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp
; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz
; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current
; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne
; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn
; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp
; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz
; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current
; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
- %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
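+
+; Editorial gloss, not part of the original test: the trailing i32 operand of
+; these intrinsics selects the embedded rounding mode, and the CHECK lines
+; above pin down the mapping: 0 -> {rn-sae}, 1 -> {rd-sae}, 2 -> {ru-sae},
+; 3 -> {rz-sae}, 4 -> current rounding (no suffix emitted). A minimal sketch
+; of a round-to-nearest call, reusing the declaration these tests assume:
+;   %r = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(
+;            <8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0)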
+
+define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
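+
+; Reading note (editorial): the test above uses the pattern followed throughout
+; this file - the intrinsic is called once with the live mask %x3 and once with
+; an all-ones mask (i8 -1), and the two results are added so FileCheck can
+; verify both the masked and unmasked forms in one function without either
+; call being dead-code eliminated.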
+
+declare <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
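+
+; Editorial inference from the CHECK lines above, not a statement of the
+; intrinsic spec: the "mask" variant merges into the first source (so %zmm0 is
+; copied aside and the 213 form is used), while the "mask3" variant merges into
+; the third source (%zmm2 is copied and the 231 form is used).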
+
+define <16 x float>@test_int_x86_avx512_mask_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_mask_vfnmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
index d6926e2571abd..ed046de005cf6 100644
--- a/test/CodeGen/X86/avx512-fma.ll
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefix=SKX
; CHECK-LABEL: test_x86_fmadd_ps_z
; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0
@@ -58,26 +59,129 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8
ret <8 x double> %res
}
-define double @test_x86_fmsub_sd_z(double %a0, double %a1, double %a2) {
+define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
+; CHECK-LABEL: test_x86_fmsub_213:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
ret double %res
}
-;CHECK-LABEL: test132_br
-;CHECK: vfmadd132ps LCP{{.*}}(%rip){1to16}
-;CHECK: ret
-define <16 x float> @test132_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
+; CHECK-LABEL: test_x86_fmsub_213_m:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a2 = load double, double* %a2_ptr
+ %x = fmul double %a0, %a1
+ %res = fsub double %x, %a2
+ ret double %res
+}
+
+define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
+; CHECK-LABEL: test_x86_fmsub_231_m:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a2 = load double, double* %a2_ptr
+ %x = fmul double %a0, %a2
+ %res = fsub double %x, %a1
+ ret double %res
+}
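+
+; A note on the 132/213/231 suffixes, paraphrasing the usual FMA operand
+; numbering since the file itself does not define it: a form "abc" computes
+; op1 = opa * opb + opc over the instruction's three operands, so the compiler
+; picks whichever form puts the loaded value in the foldable operand, as the
+; two memory-operand tests above demonstrate.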
+
+define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; CHECK-LABEL: test231_br:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
%b2 = fadd <16 x float> %b1, %a2
ret <16 x float> %b2
}
-;CHECK-LABEL: test213_br
-;CHECK: vfmadd213ps LCP{{.*}}(%rip){1to16}
-;CHECK: ret
define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; CHECK-LABEL: test213_br:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
+; CHECK-NEXT: retq
%b1 = fmul <16 x float> %a1, %a2
%b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
ret <16 x float> %b2
}
+
+; mask (a*c+b, a): masked-off lanes keep the multiplicand %a0, matching the 132 form checked below
+define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd132_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
+; CHECK-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmadd132_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+ %a2 = load <16 x float>, <16 x float>* %a2_ptrt, align 1
+ %x = fmul <16 x float> %a0, %a2
+ %y = fadd <16 x float> %x, %a1
+ %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0
+ ret <16 x float> %res
+}
+
+; mask (a*c+b, b): masked-off lanes keep the addend %a1, matching the 231 form checked below
+define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd231_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
+; CHECK-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmadd231_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %a2 = load <16 x float>, <16 x float>* %a2_ptrt, align 1
+ %x = fmul <16 x float> %a0, %a2
+ %y = fadd <16 x float> %x, %a1
+ %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+ ret <16 x float> %res
+}
+
+; mask (b*a+c, b): masked-off lanes keep the multiplicand %a1, matching the 213 form checked below
+define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd213_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm2, %zmm2
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; CHECK-NEXT: vptestmd %zmm2, %zmm2, %k1
+; CHECK-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmadd213_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %a2 = load <16 x float>, <16 x float>* %a2_ptrt, align 1
+ %x = fmul <16 x float> %a1, %a0
+ %y = fadd <16 x float> %x, %a2
+ %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+ ret <16 x float> %res
+}
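+; These three masked tests exercise pattern matching rather than intrinsics
+; (reading note): an fmul/fadd pair followed by a select on a <16 x i1> mask is
+; expected to fold into one masked vfmadd. On the plain +avx512f run the i1
+; vector arrives in %xmm2 and must be widened (vpmovsxbd), masked to bit zero
+; (vpandd), and tested (vptestmd) to build %k1, while the SKX run forms the
+; mask directly with vpmovb2m.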
+
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 0e32a1c280676..3fca5a89a6a48 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
@@ -10,52 +10,60 @@ declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>,
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
-;CHECK-LABEL: gather_mask_dps
-;CHECK: kmovw
-;CHECK: vgatherdps
-;CHECK: vpadd
-;CHECK: vscatterdps
-;CHECK: ret
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
ret void
}
-;CHECK-LABEL: gather_mask_dpd
-;CHECK: kmovw
-;CHECK: vgatherdpd
-;CHECK: vpadd
-;CHECK: vscatterdpd
-;CHECK: ret
define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dpd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
ret void
}
-;CHECK-LABEL: gather_mask_qps
-;CHECK: kmovw
-;CHECK: vgatherqps
-;CHECK: vpadd
-;CHECK: vscatterqps
-;CHECK: ret
define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
-;CHECK-LABEL: gather_mask_qpd
-;CHECK: kmovw
-;CHECK: vgatherqpd
-;CHECK: vpadd
-;CHECK: vscatterqpd
-;CHECK: ret
define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qpd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
@@ -74,162 +82,710 @@ declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i3
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
-;CHECK-LABEL: gather_mask_dd
-;CHECK: kmovw
-;CHECK: vpgatherdd
-;CHECK: vpadd
-;CHECK: vpscatterdd
-;CHECK: ret
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
ret void
}
-;CHECK-LABEL: gather_mask_qd
-;CHECK: kmovw
-;CHECK: vpgatherqd
-;CHECK: vpadd
-;CHECK: vpscatterqd
-;CHECK: ret
define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
-;CHECK-LABEL: gather_mask_qq
-;CHECK: kmovw
-;CHECK: vpgatherqq
-;CHECK: vpadd
-;CHECK: vpscatterqq
-;CHECK: ret
define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_qq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
ret void
}
-;CHECK-LABEL: gather_mask_dq
-;CHECK: kmovw
-;CHECK: vpgatherdq
-;CHECK: vpadd
-;CHECK: vpscatterdq
-;CHECK: ret
define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_mask_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
ret void
}
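+
+; Editorial gloss: these intrinsic names appear to encode index width and data
+; type - the leading "d"/"q" is the dword/qword index width and the remainder
+; the element type, so gather.dpd takes <8 x i32> indices for double data while
+; gather.qps takes <8 x i64> indices for float data, matching the vector types
+; in the declarations and tests above.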
-
-;CHECK-LABEL: gather_mask_dpd_execdomain
-;CHECK: vgatherdpd
-;CHECK: vmovapd
-;CHECK: ret
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+; CHECK-LABEL: gather_mask_dpd_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, (%rdx)
+; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
-;CHECK-LABEL: gather_mask_qpd_execdomain
-;CHECK: vgatherqpd
-;CHECK: vmovapd
-;CHECK: ret
define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+; CHECK-LABEL: gather_mask_qpd_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, (%rdx)
+; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
-;CHECK-LABEL: gather_mask_dps_execdomain
-;CHECK: vgatherdps
-;CHECK: vmovaps
-;CHECK: ret
define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
+; CHECK-LABEL: gather_mask_dps_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
 ret <16 x float> %res
}
-;CHECK-LABEL: gather_mask_qps_execdomain
-;CHECK: vgatherqps
-;CHECK: vmovaps
-;CHECK: ret
define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
+; CHECK-LABEL: gather_mask_qps_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
 ret <8 x float> %res
}
-;CHECK-LABEL: scatter_mask_dpd_execdomain
-;CHECK: vmovapd
-;CHECK: vscatterdpd
-;CHECK: ret
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = load <8 x double>, <8 x double>* %src, align 64
+; CHECK-LABEL: scatter_mask_dpd_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
+ %x = load <8 x double>, <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
ret void
}
-;CHECK-LABEL: scatter_mask_qpd_execdomain
-;CHECK: vmovapd
-;CHECK: vscatterqpd
-;CHECK: ret
define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: scatter_mask_qpd_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = load <8 x double>, <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
ret void
}
-;CHECK-LABEL: scatter_mask_dps_execdomain
-;CHECK: vmovaps
-;CHECK: vscatterdps
-;CHECK: ret
define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: scatter_mask_dps_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = load <16 x float>, <16 x float>* %src, align 64
call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
ret void
}
-;CHECK-LABEL: scatter_mask_qps_execdomain
-;CHECK: vmovaps
-;CHECK: vscatterqps
-;CHECK: ret
define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = load <8 x float>, <8 x float>* %src, align 32
+; CHECK-LABEL: scatter_mask_qps_execdomain:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
+ %x = load <8 x float>, <8 x float>* %src, align 32
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
ret void
}
-;CHECK-LABEL: gather_qps
-;CHECK: kxnorw
-;CHECK: vgatherqps
-;CHECK: vpadd
-;CHECK: vscatterqps
-;CHECK: ret
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
+; CHECK-LABEL: gather_qps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
+; CHECK-NEXT: retq
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
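+
+; Observation on the unmasked test above (editorial): an all-ones mask (i8 -1)
+; is materialized with kxnorw of a %k register with itself, setting every mask
+; bit without a round trip through a general-purpose register.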
-;CHECK-LABEL: prefetch
-;CHECK: gatherpf0
-;CHECK: gatherpf1
-;CHECK: scatterpf0
-;CHECK: scatterpf1
-;CHECK: ret
declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
define void @prefetch(<8 x i64> %ind, i8* %base) {
+; CHECK-LABEL: prefetch:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
+; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
+; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
+; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
+; CHECK-NEXT: retq
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
ret void
}
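+
+; Editorial gloss on the prefetch test: the final i32 selects the hint level
+; (0 -> gatherpf0/scatterpf0, 1 -> gatherpf1/scatterpf1) and the i32 before it
+; is the address scale (4 for the gathers, 2 for the scatters), matching the
+; (%rdi,%zmm0,scale) operands in the CHECK lines.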
+
+
+declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,0), %xmm0 {%k1}
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
+ %res1 = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)
+
+define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,0), %ymm0 {%k1}
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
+ %res1 = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherqps (%rdi,%xmm1,0), %xmm0 {%k1}
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 0)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
+; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
+ %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherqps (%rdi,%ymm1,0), %xmm0 {%k1}
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 0)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
+; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %xmm0 {%k1}
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+ %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)
+
+define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,0), %ymm0 {%k1}
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
+; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+ %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherdps (%rdi,%xmm1,0), %xmm0 {%k1}
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 0)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)
+
+define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
+; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,0), %xmm0 {%k1}
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
+ %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 0)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)
+
+define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vgatherdps (%rdi,%ymm1,0), %ymm0 {%k1}
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)
+
+define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: kmovw %k1, %k2
+; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
+; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,0), %ymm0 {%k1}
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 0)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
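+
+; Naming sketch for the gather3 intrinsics above, inferred from the operand
+; types and emitted instructions (treat as an assumption): "div" variants take
+; <N x i64> quadword indices and lower to vgatherq*, "siv" variants take
+; <N x i32> doubleword indices and lower to vgatherd*; the digit is the number
+; of index elements, and df/di/sf/si name the gathered element type.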
+
+declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)
+
+define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,0) {%k2}
+; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)
+
+define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)
+
+define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)
+
+define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)
+
+define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)
+
+define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,0) {%k2}
+; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)
+
+define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)
+
+define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 0)
+ call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)
+
+define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,0) {%k2}
+; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)
+
+define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,0) {%k2}
+; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)
+
+define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)
+
+define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: kxnorw %k2, %k2, %k2
+; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,0) {%k2}
+; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)
+
+define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)
+
+define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)
+
+define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
+ ret void
+}
+
+declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)
+
+define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
+; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,0) {%k1}
+; CHECK-NEXT: kxnorw %k1, %k1, %k1
+; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 0)
+ call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
+ ret void
+}
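+
+; Editorial summary: as with the gathers, the trailing i32 of each scatter
+; intrinsic is the address scale and surfaces directly in the (%rdi,%xmm0,N)
+; operand of the CHECK lines; each test pairs an all-ones-mask call with a
+; live-mask call so both encodings are covered.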
+
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index a06cadaa3f5ab..b9f490b8a39af 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -489,19 +489,31 @@ declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>
}
declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
- define <16 x i32> @test_pabsd(<16 x i32> %a) {
- ;CHECK: vpabsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0]
- %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
- }
declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
- define <8 x i64> @test_pabsq(<8 x i64> %a) {
- ;CHECK: vpabsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0]
- %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a, <8 x i64>zeroinitializer, i8 -1)
- ret <8 x i64> %res
- }
- declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsd{{.*}}{%k1}
+define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsq{{.*}}{%k1}
+define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
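+
+; Reading aid for the rewritten pabs tests above and the vpermi2/vpermt2 tests
+; later in this file: "CHECK-NOT: call" asserts the intrinsic is lowered to a
+; real instruction rather than left as a call, "kmov" pins the mask transfer
+; into a %k register, and the {%k1} (or {%k1} {z}) suffix checks the merge- or
+; zero-masked form.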
define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK: vptestmq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1]
@@ -3013,3 +3025,146 @@ define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %
%res2 = add <8 x i64> %res, %res1
ret <8 x i64> %res2
}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2d {{.*}}{%k1}
+define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2pd {{.*}}{%k1}
+define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
+ %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2ps {{.*}}{%k1}
+define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2q {{.*}}{%k1}
+define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2d {{.*}}{%k1} {z}
+define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+ %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2pd {{.*}}{%k1} {z}
+define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
+ %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2ps {{.*}}{%k1} {z}
+define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+ %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+

+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2q {{.*}}{%k1} {z}
+define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2d {{.*}}{%k1}
+; CHECK-NOT: {z}
+define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
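+
+; vscalefpd computes x0 * 2^floor(x1) per element; the trailing i32 operand
+; selects the embedded rounding mode.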
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefpd{{.*}}{%k1}
+define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
+ %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
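+
+; Single-precision counterpart of the scalef test above.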
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefps{{.*}}{%k1}
+define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+ %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 2)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
deleted file mode 100644
index 7e9eda58737d1..0000000000000
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ /dev/null
@@ -1,392 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX
-
-; CHECK-LABEL: test1:
-; CHECK: vpermps
-; CHECK: ret
-define <16 x float> @test1(<16 x float> %a) nounwind {
- %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
- ret <16 x float> %c
-}
-
-; CHECK-LABEL: test2:
-; CHECK: vpermd
-; CHECK: ret
-define <16 x i32> @test2(<16 x i32> %a) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test3:
-; CHECK: vpermq
-; CHECK: ret
-define <8 x i64> @test3(<8 x i64> %a) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test4:
-; CHECK: vpermpd
-; CHECK: ret
-define <8 x double> @test4(<8 x double> %a) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test5:
-; CHECK: vpermt2pd
-; CHECK: ret
-define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test6:
-; CHECK: vpermq $30
-; CHECK: ret
-define <8 x i64> @test6(<8 x i64> %a) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test7:
-; CHECK: vpermt2q
-; CHECK: ret
-define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test8:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test9:
-; CHECK: vpermt2ps
-; CHECK: ret
-define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
- %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x float> %c
-}
-
-; CHECK-LABEL: test10:
-; CHECK: vpermt2ps (
-; CHECK: ret
-define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
- %c = load <16 x float>, <16 x float>* %b
- %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x float> %d
-}
-
-; CHECK-LABEL: test11:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
- %c = load <16 x i32>, <16 x i32>* %b
- %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x i32> %d
-}
-
-; CHECK-LABEL: test13
-; CHECK: vpermilps $177, %zmm
-; CHECK: ret
-define <16 x float> @test13(<16 x float> %a) {
- %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test14
-; CHECK: vpermilpd $203, %zmm
-; CHECK: ret
-define <8 x double> @test14(<8 x double> %a) {
- %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
- ret <8 x double> %b
-}
-
-; CHECK-LABEL: test15
-; CHECK: vpshufd $177, %zmm
-; CHECK: ret
-define <16 x i32> @test15(<16 x i32> %a) {
-; mask 1-0-3-2 = 10110001 = 0xb1 = 177
- %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i32> %b
-}
-; CHECK-LABEL: test16
-; CHECK: valignq $3, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test17
-; CHECK: vshufpd $19, %zmm1, %zmm0
-; CHECK: ret
-define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test18
-; CHECK: vpunpckhdq %zmm
-; CHECK: ret
-define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test19
-; CHECK: vpunpckldq %zmm
-; CHECK: ret
-define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test20
-; CHECK: vpunpckhqdq %zmm
-; CHECK: ret
-define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
- %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
- ret <8 x i64> %b
-}
-
-; CHECK-LABEL: test21
-; CHECK: vbroadcastsd %xmm0, %zmm
-; CHECK: ret
-define <8 x double> @test21(<8 x double> %a, <8 x double> %b) {
- %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- ret <8 x double> %shuffle
-}
-
-; CHECK-LABEL: test22
-; CHECK: vpbroadcastq %xmm0, %zmm
-; CHECK: ret
-define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) {
- %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- ret <8 x i64> %shuffle
-}
-
-; CHECK-LABEL: @test23
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: ret
-define <16 x i32> @test23(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test24
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 25, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test25
-; CHECK: vshufps $52
-; CHECK: ret
-define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
-; mask - 0-1-3-0 00110100 = 0x34 = 52
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 16, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test26
-; CHECK: vmovshdup
-; CHECK: ret
-define <16 x i32> @test26(<16 x i32> %a) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test27
-; CHECK: ret
-define <16 x i32> @test27(<4 x i32>%a) {
- %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: test28
-; CHECK: vpshufhw $177, %ymm
-; CHECK: ret
-define <16 x i16> @test28(<16 x i16> %a) {
- %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32><i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i16> %b
-}
-
-; CHECK-LABEL: test29
-; CHECK: vunpcklps %zmm
-; CHECK: ret
-define <16 x float> @test29(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: @test30
-; CHECK: vshufps $144, %zmm
-; CHECK: ret
-define <16 x float> @test30(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test31
-; CHECK: valignd $3, %zmm0, %zmm1
-; CHECK: ret
-define <16 x i32> @test31(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test32
-; CHECK: vshufpd $99, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 1, i32 10, i32 2, i32 undef, i32 5, i32 15, i32 undef>
- ret <8 x double> %c
-}
-
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
-; CHECK-LABEL: test_vshuff64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
-; CHECK-LABEL: test_vshuff64x2_512_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
- %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
- ret <8 x double> %res
-}
-
-define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
-; CHECK-LABEL: test_vshufi64x2_512_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 4, i32 5>
- %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
- ret <8 x i64> %res
-}
-
-define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
-; CHECK-LABEL: test_vshuff64x2_512_mem:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vshuff64x2 $40, %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %x1 = load <8 x double>,<8 x double> *%ptr,align 1
- %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 0, i32 1>
- ret <8 x double> %res
-}
-
-define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind {
-; CHECK-LABEL: test_vshuff32x4_512_mem:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vshuff64x2 $20, %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %x1 = load <16 x float>,<16 x float> *%ptr,align 1
- %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
- ret <16 x float> %res
-}
-
-define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
-; CHECK-LABEL: test_align_v16i32_rr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- ret <16 x i32> %c
-}
-
-define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind {
-; CHECK-LABEL: test_align_v16i32_rm:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: retq
- %a = load <16 x i32>, <16 x i32>* %a.ptr
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- ret <16 x i32> %c
-}
-
-define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind {
-; CHECK-LABEL: test_align_v16i32_rm_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
-;
-; CHECK-SKX-LABEL: test_align_v16i32_rm_mask:
-; CHECK-SKX: ## BB#0:
-; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1
-; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1
-; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-SKX-NEXT: retq
- %a = load <16 x i32>, <16 x i32>* %a.ptr
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a
- ret <16 x i32> %res
-}
-
-define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind {
-; CHECK-LABEL: test_align_v8f64_rr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- ret <8 x double> %c
-}
-
-define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind {
-; CHECK-LABEL: test_align_v18f64_rm:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: retq
- %a = load <8 x double>, <8 x double>* %a.ptr
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- ret <8 x double> %c
-}
-
-define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind {
-; CHECK-LABEL: test_align_v18f64_rm_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
-;
-; CHECK-SKX-LABEL: test_align_v18f64_rm_mask:
-; CHECK-SKX: ## BB#0:
-; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1
-; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-SKX-NEXT: retq
- %a = load <8 x double>, <8 x double>* %a.ptr
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer
- ret <8 x double> %res
-}
-
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 9ee0e09d1b7a2..9574c016ad509 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -893,6 +893,45 @@ define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16
ret <32 x i16> %res2
}
+declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
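+; AVX-512BW extends the two-source variable permute to word granularity;
+; vpermt2w overwrites one of the table operands with the result.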
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2w %zmm{{.*}}{%k1}
+define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2w %zmm{{.*}}{%k1} {z}
+define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2w %zmm{{.*}}{%k1}
+define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_512
@@ -918,3 +957,43 @@ define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16>
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
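+; vpshufb shuffles bytes within each 128-bit lane using the low four bits of
+; each control byte; a set high bit zeroes the destination byte.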
+; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpshufb %zmm{{.*}}{%k1}
+define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32)
+
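+; vpabsw and vpabsb (below) compute the per-element absolute value, merging
+; into the passthrough operand under %k1.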
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsw{{.*}}{%k1}
+define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_512
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsb{{.*}}{%k1}
+define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index cf8c32a48b6b0..0119d3945f4e8 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -612,248 +612,925 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
-declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
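+; The masked FMA intrinsics were renamed from llvm.x86.fma.mask.* to
+; llvm.x86.avx512.mask.*; the declarations and calls below are updated to the
+; new names.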
define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd256_ps
; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
- %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
-declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps
; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
-declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
ret <4 x double> %res
}
-declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd128_pd:
; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
ret <2 x double> %res
}
-declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
-
-define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsub256_ps
- ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2]
- %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
- ret <8 x float> %res
-}
-
-declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsub128_ps
- ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
- ret <4 x float> %res
-}
-
-declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
-
-define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsub256_pd
- ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
- ret <4 x double> %res
-}
-
-declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
-
-define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsub128_pd
- ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
- ret <2 x double> %res
-}
-
-declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
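+; The mask/mask3/maskz suffixes select which operand carries the passthrough:
+; mask merges into the first source (vfmadd213 form), mask3 merges into the
+; addend (vfmadd231 form), and maskz zeroes the masked lanes.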
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
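+; vfmsub computes (x * y) - z; as with vfmadd, the mask3 form merges into the
+; third operand and lowers to the 231 encoding.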
+define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd256_ps
; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
- %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
-declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd128_ps
; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
-declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd256_pd
; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
-declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd128_pd
; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
-declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub256_ps
; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
- %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
-declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub128_ps
; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
-declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub256_pd
; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
-declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub128_pd
; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
-declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
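+; vfnmsub computes -(x * y) - z and vfnmadd computes -(x * y) + z; the tests
+; below exercise both the mask form, which merges into the first source, and
+; the mask3 form, which merges into the third.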
+define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub256_ps:
; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
- %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
ret <8 x float> %res
}
-declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub128_ps:
; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
ret <4 x float> %res
}
-declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmaddsub256_pd
; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
-declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmaddsub128_pd
; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
-declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
-
-define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubadd256_ps
- ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2]
- %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
- ret <8 x float> %res
-}
-
-declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubadd128_ps
- ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
- ret <4 x float> %res
-}
-
-declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
-
-define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubadd256_pd
- ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
- ret <4 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
-
-define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubadd128_pd
- ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
- ret <2 x double> %res
+define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
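+; The maskz variants request zeroing-masking, so the masked instruction is
+; printed with a {z} suffix in addition to {%k1}.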
+define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1}
+; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
}
-define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd
- ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07]
- %a2 = load <2 x double>, <2 x double>* %ptr_a2
- %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
- ret <2 x double> %res
-}
-declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
-define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmsubaddrm_pd
- ; CHECK: vfmsubadd213pd (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07]
- %a2 = load <8 x double>, <8 x double>* %ptr_a2, align 8
- %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
- ret <8 x double> %res
-}
define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps_r
; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rz
; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
@@ -861,7 +1538,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1,
; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -869,7 +1546,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1
; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -877,7 +1554,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1
; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
@@ -885,7 +1562,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a
; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
@@ -897,7 +1574,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -909,7 +1586,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -921,7 +1598,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
ret <4 x float> %res
}
@@ -933,21 +1610,21 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
- %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
ret <4 x float> %res
}
define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_pd_r
; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; CHECK-LABEL: test_mask_vfmadd128_pd_rz
; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
- %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
ret <2 x double> %res
}
@@ -955,7 +1632,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %
; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
- %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
@@ -963,21 +1640,21 @@ define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double>
; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
- %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
ret <2 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd256_pd_r
; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; CHECK-LABEL: test_mask_vfmadd256_pd_rz
; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
- %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
ret <4 x double> %res
}
@@ -985,7 +1662,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %
; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
- %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
@@ -993,7 +1670,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double>
; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
- %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+ %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
ret <4 x double> %res
}
define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
@@ -2877,6 +3554,85 @@ define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16
ret <16 x i16> %res2
}
+declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
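+; The checks below assert that the intrinsic is fully selected (no residual
+; call), that the scalar mask reaches a k-register via kmov, and that the
+; expected masked instruction is emitted.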
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2w %xmm{{.*}}{%k1}
+; CHECK-NOT: {z}
+define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2w %xmm{{.*}}{%k1} {z}
+define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2w %ymm{{.*}}{%k1}
+define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2w %ymm{{.*}}{%k1} {z}
+define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2w %xmm{{.*}}{%k1}
+define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2w %ymm{{.*}}{%k1}
+define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_128
@@ -2928,3 +3684,82 @@ define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16>
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpshufb %xmm{{.*}}{%k1}
+define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpshufb %ymm{{.*}}{%k1}
+define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsb{{.*}}{%k1}
+define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsb{{.*}}{%k1}
+define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsw{{.*}}{%k1}
+define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsw{{.*}}{%k1}
+define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index dfd4986b85c1e..fb7c93dc53b3a 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2794,4 +2794,213 @@ define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
ret <4 x i64> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2d %xmm{{.*}}{%k1}
+; CHECK-NOT: {z}
+define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2d %xmm{{.*}}{%k1} {z}
+define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+ %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2d %ymm{{.*}}{%k1}
+; CHECK-NOT: {z}
+define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermt2d {{.*}}{%k1} {z}
+define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+ %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2pd %xmm{{.*}}{%k1}
+define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2pd %ymm{{.*}}{%k1}
+define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
+ %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2ps %xmm{{.*}}{%k1}
+define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
+ %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2ps %ymm{{.*}}{%k1}
+define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsq{{.*}}{%k1}
+define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsq{{.*}}{%k1}
+define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsd{{.*}}{%k1}
+define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsd{{.*}}{%k1}
+define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+
+declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefpd{{.*}}{%k1}
+define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+ %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefpd{{.*}}{%k1}
+define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+ %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_128
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefps{{.*}}{%k1}
+define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+ %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_256
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefps{{.*}}{%k1}
+define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+ %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
}
\ No newline at end of file
diff --git a/test/CodeGen/X86/coff-weak.ll b/test/CodeGen/X86/coff-weak.ll
new file mode 100644
index 0000000000000..369750147f292
--- /dev/null
+++ b/test/CodeGen/X86/coff-weak.ll
@@ -0,0 +1,9 @@
+; RUN: llc -function-sections -o - %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; CHECK: .section{{.*}}one_only
+define linkonce_odr void @foo() {
+ ret void
+}
diff --git a/test/CodeGen/X86/commute-two-addr.ll b/test/CodeGen/X86/commute-two-addr.ll
index 656c385e2bc7d..5b01e2f4e90d5 100644
--- a/test/CodeGen/X86/commute-two-addr.ll
+++ b/test/CodeGen/X86/commute-two-addr.ll
@@ -39,7 +39,7 @@ define %0 @t3(i32 %lb, i8 zeroext %has_lb, i8 zeroext %lb_inclusive, i32 %ub, i8
entry:
; DARWIN-LABEL: t3:
; DARWIN: shlq $32, %rcx
-; DARWIN-NEXT: orq %rcx, %rax
+; DARWIN-NEXT: leaq (%rax,%rcx), %rax
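+; The or of the two disjoint halves is now selected as an add, which folds
+; into the leaq above.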
; DARWIN-NEXT: shll $8
; DARWIN-NOT: leaq
%tmp21 = zext i32 %lb to i64
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index 629a5572977f8..bb5e92f98c7d7 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -71,33 +71,33 @@ define weak_odr dllexport void @weak1() {
@blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*)
; CHECK: .section .drectve
-; WIN32: " /EXPORT:Var1,DATA"
-; WIN32: " /EXPORT:Var2,DATA"
-; WIN32: " /EXPORT:Var3,DATA"
-; WIN32: " /EXPORT:WeakVar1,DATA"
-; WIN32: " /EXPORT:WeakVar2,DATA"
-; WIN32: " /EXPORT:f1"
-; WIN32: " /EXPORT:f2"
-; WIN32: " /EXPORT:lnk1"
-; WIN32: " /EXPORT:lnk2"
-; WIN32: " /EXPORT:weak1"
-; WIN32: " /EXPORT:alias"
-; WIN32: " /EXPORT:alias2"
-; WIN32: " /EXPORT:alias3"
-; WIN32: " /EXPORT:weak_alias"
-; WIN32: " /EXPORT:blob_alias"
-; MINGW: " -export:Var1,data"
-; MINGW: " -export:Var2,data"
-; MINGW: " -export:Var3,data"
-; MINGW: " -export:WeakVar1,data"
-; MINGW: " -export:WeakVar2,data"
-; MINGW: " -export:f1"
-; MINGW: " -export:f2"
-; MINGW: " -export:lnk1"
-; MINGW: " -export:lnk2"
-; MINGW: " -export:weak1"
-; MINGW: " -export:alias"
-; MINGW: " -export:alias2"
-; MINGW: " -export:alias3"
-; MINGW: " -export:weak_alias"
-; MINGW: " -export:blob_alias"
+; WIN32: /EXPORT:f1
+; WIN32-SAME: /EXPORT:f2
+; WIN32-SAME: /EXPORT:lnk1
+; WIN32-SAME: /EXPORT:lnk2
+; WIN32-SAME: /EXPORT:weak1
+; WIN32-SAME: /EXPORT:Var1,DATA
+; WIN32-SAME: /EXPORT:Var2,DATA
+; WIN32-SAME: /EXPORT:Var3,DATA
+; WIN32-SAME: /EXPORT:WeakVar1,DATA
+; WIN32-SAME: /EXPORT:WeakVar2,DATA
+; WIN32-SAME: /EXPORT:alias
+; WIN32-SAME: /EXPORT:alias2
+; WIN32-SAME: /EXPORT:alias3
+; WIN32-SAME: /EXPORT:weak_alias
+; WIN32-SAME: /EXPORT:blob_alias
+; MINGW: -export:f1
+; MINGW-SAME: -export:f2
+; MINGW-SAME: -export:lnk1
+; MINGW-SAME: -export:lnk2
+; MINGW-SAME: -export:weak1
+; MINGW-SAME: -export:Var1,data
+; MINGW-SAME: -export:Var2,data
+; MINGW-SAME: -export:Var3,data
+; MINGW-SAME: -export:WeakVar1,data
+; MINGW-SAME: -export:WeakVar2,data
+; MINGW-SAME: -export:alias
+; MINGW-SAME: -export:alias2
+; MINGW-SAME: -export:alias3
+; MINGW-SAME: -export:weak_alias
+; MINGW-SAME: -export:blob_alias"
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index 02a83ae7b191d..915567de5bf77 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -89,40 +89,41 @@ define weak_odr dllexport void @weak1() {
@weak_alias = weak_odr dllexport alias void()* @f1
; CHECK: .section .drectve
-; CHECK-CL: " /EXPORT:_Var1,DATA"
-; CHECK-CL: " /EXPORT:_Var2,DATA"
-; CHECK-CL: " /EXPORT:_Var3,DATA"
-; CHECK-CL: " /EXPORT:_WeakVar1,DATA"
-; CHECK-CL: " /EXPORT:_WeakVar2,DATA"
-; CHECK-CL: " /EXPORT:_f1"
-; CHECK-CL: " /EXPORT:_f2"
; CHECK-CL-NOT: not_exported
-; CHECK-CL: " /EXPORT:_stdfun@0"
-; CHECK-CL: " /EXPORT:@fastfun@0"
-; CHECK-CL: " /EXPORT:_thisfun"
-; CHECK-CL: " /EXPORT:_lnk1"
-; CHECK-CL: " /EXPORT:_lnk2"
-; CHECK-CL: " /EXPORT:_weak1"
-; CHECK-CL: " /EXPORT:_alias"
-; CHECK-CL: " /EXPORT:_alias2"
-; CHECK-CL: " /EXPORT:_alias3"
-; CHECK-CL: " /EXPORT:_weak_alias"
-; CHECK-GCC: " -export:Var1,data"
-; CHECK-GCC: " -export:Var2,data"
-; CHECK-GCC: " -export:Var3,data"
-; CHECK-GCC: " -export:WeakVar1,data"
-; CHECK-GCC: " -export:WeakVar2,data"
-; CHECK-GCC: " -export:f1"
-; CHECK-GCC: " -export:f2"
+; CHECK-CL: /EXPORT:_f1
+; CHECK-CL-SAME: /EXPORT:_f2
+; CHECK-CL-SAME: /EXPORT:_stdfun@0
+; CHECK-CL-SAME: /EXPORT:@fastfun@0
+; CHECK-CL-SAME: /EXPORT:_thisfun
+; CHECK-CL-SAME: /EXPORT:_lnk1
+; CHECK-CL-SAME: /EXPORT:_lnk2
+; CHECK-CL-SAME: /EXPORT:_weak1
+; CHECK-CL-SAME: /EXPORT:_Var1,DATA
+; CHECK-CL-SAME: /EXPORT:_Var2,DATA
+; CHECK-CL-SAME: /EXPORT:_Var3,DATA
+; CHECK-CL-SAME: /EXPORT:_WeakVar1,DATA
+; CHECK-CL-SAME: /EXPORT:_WeakVar2,DATA
+; CHECK-CL-SAME: /EXPORT:_alias
+; CHECK-CL-SAME: /EXPORT:_alias2
+; CHECK-CL-SAME: /EXPORT:_alias3
+; CHECK-CL-SAME: /EXPORT:_weak_alias"
; CHECK-CL-NOT: not_exported
-; CHECK-GCC: " -export:stdfun@0"
-; CHECK-GCC: " -export:@fastfun@0"
-; CHECK-GCC: " -export:thisfun"
-; CHECK-GCC: " -export:lnk1"
-; CHECK-GCC: " -export:lnk2"
-; CHECK-GCC: " -export:weak1"
-; CHECK-GCC: " -export:alias"
-; CHECK-GCC: " -export:alias2"
-; CHECK-GCC: " -export:alias3"
-; CHECK-GCC: " -export:weak_alias"
-
+; CHECK-GCC-NOT: not_exported
+; CHECK-GCC: -export:f1
+; CHECK-GCC-SAME: -export:f2
+; CHECK-GCC-SAME: -export:stdfun@0
+; CHECK-GCC-SAME: -export:@fastfun@0
+; CHECK-GCC-SAME: -export:thisfun
+; CHECK-GCC-SAME: -export:lnk1
+; CHECK-GCC-SAME: -export:lnk2
+; CHECK-GCC-SAME: -export:weak1
+; CHECK-GCC-SAME: -export:Var1,data
+; CHECK-GCC-SAME: -export:Var2,data
+; CHECK-GCC-SAME: -export:Var3,data
+; CHECK-GCC-SAME: -export:WeakVar1,data
+; CHECK-GCC-SAME: -export:WeakVar2,data
+; CHECK-GCC-SAME: -export:alias
+; CHECK-GCC-SAME: -export:alias2
+; CHECK-GCC-SAME: -export:alias3
+; CHECK-GCC-SAME: -export:weak_alias"
+; CHECK-GCC-NOT: not_exported
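The decorated names in the CHECK-CL block reflect the usual i386 Windows mangling for each calling convention. A hedged illustration; the source-level prototypes are assumptions inferred from the symbols, not part of the test:

; assumed mapping (illustrative only):
;   cdecl     f1       ->  _f1
;   stdcall   stdfun   ->  _stdfun@0    (suffix = argument bytes)
;   fastcall  fastfun  ->  @fastfun@0
;   thiscall  thisfun  ->  _thisfun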
diff --git a/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll b/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
new file mode 100644
index 0000000000000..f7d0cdf3c65a1
--- /dev/null
+++ b/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll
@@ -0,0 +1,204 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s
+
+; CHECK-LABEL: fmaddsubpd_loop:
+; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddpd_loop:
+; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddpd_loop:
+; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubpd_loop:
+; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <4 x double> %c.addr.0
+}
+
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+
+; CHECK-LABEL: fmaddsubps_loop:
+; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddps_loop:
+; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddps_loop:
+; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubps_loop:
+; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %iter
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+ br label %for.inc
+
+for.inc:
+ %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end:
+ ret <8 x float> %c.addr.0
+}
+
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
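Why every loop above expects the 231 form: in FMA3 mnemonics the digits name the operand roles, so vfmadd213pd computes dst = src2*src1 + src3 while vfmadd231pd computes dst = src2*src3 + src1, making the destination register itself the addend. Each loop feeds the intrinsic's addend back through the phi, so selecting the 231 variant keeps the accumulator in one register across iterations with no copy. A worked sketch in AT&T syntax; the register choices are illustrative, not from the checks:

; vfmadd231pd %ymm2, %ymm1, %ymm0   ; ymm0 = ymm1*ymm2 + ymm0, ymm0 is the loop accumulator
; vfmadd213pd %ymm2, %ymm1, %ymm0   ; ymm0 = ymm1*ymm0 + ymm2, would need an extra move per iteration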
diff --git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll
new file mode 100644
index 0000000000000..881436386bac8
--- /dev/null
+++ b/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -0,0 +1,493 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+
+; VFMADD
+define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFMSUB
+define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFNMADD
+define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFNMSUB
+define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFMADDSUB
+define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFMSUBADD
+define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-FMA: # BB#0:
+; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
+; CHECK-FMA-NEXT: retq
+;
+; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-FMA4: # BB#0:
+; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+attributes #0 = { nounwind }
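The two check prefixes separate the encodings that the RUN lines toggle: FMA3 (vfmadd213ss and friends) is a destructive three-operand form whose destination doubles as a source, while FMA4 (vfmaddss) carries an explicit fourth operand so no source is clobbered; the bdver2 runs rely on that CPU supporting both instruction sets, with -fma or -fma4 forcing one or the other. Read in AT&T syntax, the checks above amount to:

; FMA3:  vfmadd213ss %xmm2, %xmm1, %xmm0          ; xmm0 = xmm1*xmm0 + xmm2 (xmm0 overwritten)
; FMA4:  vfmaddss    %xmm2, %xmm1, %xmm0, %xmm0   ; xmm0 = xmm0*xmm1 + xmm2 (destination explicit)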
diff --git a/test/CodeGen/X86/fma-intrinsics-x86_64.ll b/test/CodeGen/X86/fma-intrinsics-x86_64.ll
deleted file mode 100644
index aadd7311bb89e..0000000000000
--- a/test/CodeGen/X86/fma-intrinsics-x86_64.ll
+++ /dev/null
@@ -1,278 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
-
-; VFMADD
-define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfmaddss
- ; CHECK-FMA: vfmadd213ss
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfmaddsd
- ; CHECK-FMA: vfmadd213sd
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfmaddps
- ; CHECK-FMA: vfmadd213ps
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfmaddpd
- ; CHECK-FMA: vfmadd213pd
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK-FMA4: vfmaddps
- ; CHECK-FMA: vfmadd213ps
- ; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
- ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK-FMA4: vfmaddpd
- ; CHECK-FMA: vfmadd213pd
- ; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
- ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFMSUB
-define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfmsubss
- ; CHECK-FMA: vfmsub213ss
- %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfmsubsd
- ; CHECK-FMA: vfmsub213sd
- %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfmsubps
- ; CHECK-FMA: vfmsub213ps
- %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfmsubpd
- ; CHECK-FMA: vfmsub213pd
- %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK-FMA4: vfmsubps
- ; CHECK-FMA: vfmsub213ps
- ; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
- ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK-FMA4: vfmsubpd
- ; CHECK-FMA: vfmsub213pd
- ; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
- ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFNMADD
-define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfnmaddss
- ; CHECK-FMA: vfnmadd213ss
- %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfnmaddsd
- ; CHECK-FMA: vfnmadd213sd
- %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfnmaddps
- ; CHECK-FMA: vfnmadd213ps
- %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfnmaddpd
- ; CHECK-FMA: vfnmadd213pd
- %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK-FMA4: vfnmaddps
- ; CHECK-FMA: vfnmadd213ps
- ; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
- ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK-FMA4: vfnmaddpd
- ; CHECK-FMA: vfnmadd213pd
- ; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
- ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFNMSUB
-define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfnmsubss
- ; CHECK-FMA: vfnmsub213ss
- %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfnmsubsd
- ; CHECK-FMA: vfnmsub213sd
- %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfnmsubps
- ; CHECK-FMA: vfnmsub213ps
- %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfnmsubpd
- ; CHECK-FMA: vfnmsub213pd
- %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK-FMA4: vfnmsubps
- ; CHECK-FMA: vfnmsub213ps
- ; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
- ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK-FMA4: vfnmsubpd
- ; CHECK-FMA: vfnmsub213pd
- ; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
- ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFMADDSUB
-define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfmaddsubps
- ; CHECK-FMA: vfmaddsub213ps
- %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfmaddsubpd
- ; CHECK-FMA: vfmaddsub213pd
- %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK-FMA4: vfmaddsubps
- ; CHECK-FMA: vfmaddsub213ps
- ; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
- ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK-FMA4: vfmaddsubpd
- ; CHECK-FMA: vfmaddsub213pd
- ; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
- ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFMSUBADD
-define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
- ; CHECK-FMA4: vfmsubaddps
- ; CHECK-FMA: vfmsubadd213ps
- %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
- ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
- ; CHECK-FMA4: vfmsubaddpd
- ; CHECK-FMA: vfmsubadd213pd
- %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
- ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
- ; CHECK-FMA4: vfmsubaddps
- ; CHECK-FMA: vfmsubadd213ps
- ; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
- ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
- ; CHECK-FMA4: vfmsubaddpd
- ; CHECK-FMA: vfmsubadd213pd
- ; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
- ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll
index 9715bc7b328b7..34acdfe830f0e 100644
--- a/test/CodeGen/X86/fma-phi-213-to-231.ll
+++ b/test/CodeGen/X86/fma-phi-213-to-231.ll
@@ -1,246 +1,37 @@
-; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
-
-; CHECK-LABEL: fmaddsubpd_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <4 x double> %c.addr.0
-}
-
-; CHECK-LABEL: fmsubaddpd_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <4 x double> %c.addr.0
-}
-
-; CHECK-LABEL: fmaddpd_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <4 x double> %c.addr.0
-}
-
-; CHECK-LABEL: fmsubpd_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <4 x double> %c.addr.0
-}
-
-declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
-declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
-declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
-declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-
-; CHECK-LABEL: fmaddsubps_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <8 x float> %c.addr.0
-}
-
-; CHECK-LABEL: fmsubaddps_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <8 x float> %c.addr.0
-}
-
-; CHECK-LABEL: fmaddps_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <8 x float> %c.addr.0
-}
-
-; CHECK-LABEL: fmsubps_loop
-; CHECK: [[BODYLBL:LBB.+]]:
-; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
-; CHECK: [[INCLBL:LBB.+]]:
-; CHECK: addl $1, [[INDREG:%[a-z0-9]+]]
-; CHECK: cmpl {{%.+}}, [[INDREG]]
-; CHECK: jl [[BODYLBL]]
-define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
-entry:
- br label %for.cond
-
-for.cond:
- %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
- %cmp = icmp slt i32 %i.0, %iter
- br i1 %cmp, label %for.body, label %for.end
-
-for.body:
- br label %for.inc
-
-for.inc:
- %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
- %inc = add nsw i32 %i.0, 1
- br label %for.cond
-
-for.end:
- ret <8 x float> %c.addr.0
-}
-
-declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s
+
+; Test FMA3 variant selection (213 vs. 231) when the accumulator is a loop-carried phi.
+
+; CHECK-LABEL: fma3_select231ssX:
+; CHECK: vfmadd231ss %xmm
+define float @fma3_select231ssX(float %x, float %y) {
+entry:
+ br label %while.body
+while.body:
+ %acc.01 = phi float [ 0.000000e+00, %entry ], [ %acc, %while.body ]
+ %acc = call float @llvm.fma.f32(float %x, float %y, float %acc.01)
+ %b = fcmp ueq float %acc, 0.0
+ br i1 %b, label %while.body, label %while.end
+while.end:
+ ret float %acc
+}
+
+; CHECK-LABEL: fma3_select231pdY:
+; CHECK: vfmadd231pd %ymm
+define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) {
+entry:
+ br label %while.body
+while.body:
+ %acc.04 = phi <4 x double> [ zeroinitializer, %entry ], [ %add, %while.body ]
+ %add = call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %acc.04)
+ %vecext = extractelement <4 x double> %add, i32 0
+ %cmp = fcmp oeq double %vecext, 0.000000e+00
+ br i1 %cmp, label %while.body, label %while.end
+while.end:
+ ret <4 x double> %add
+}
+
+declare float @llvm.fma.f32(float, float, float)
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 2eb152b078eff..b91479cda8715 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,80 +1,47 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-
-; CHECK: test_f32
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+avx512f,-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL
+
+; CHECK-LABEL: test_f32:
; CHECK-FMA-INST: vfmadd213ss
; CHECK-FMA-CALL: fmaf
-
-define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
+define float @test_f32(float %a, float %b, float %c) #0 {
entry:
- %call = tail call float @llvm.fma.f32(float %a, float %b, float %c) nounwind readnone
+ %call = call float @llvm.fma.f32(float %a, float %b, float %c)
ret float %call
}
-; CHECK: test_f64
+; CHECK-LABEL: test_f64:
; CHECK-FMA-INST: vfmadd213sd
; CHECK-FMA-CALL: fma
-
-define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
+define double @test_f64(double %a, double %b, double %c) #0 {
entry:
- %call = tail call double @llvm.fma.f64(double %a, double %b, double %c) nounwind readnone
+ %call = call double @llvm.fma.f64(double %a, double %b, double %c)
ret double %call
}
-; CHECK: test_f80
+; CHECK-LABEL: test_f80:
; CHECK: fmal
-
-define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
+define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
entry:
- %call = tail call x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone
+ %call = call x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c)
ret x86_fp80 %call
}
-; CHECK: test_f32_cst
-; CHECK-NOT: fma
-define float @test_f32_cst() nounwind readnone ssp {
+; CHECK-LABEL: test_f32_cst:
+; CHECK-NOT: vfmadd
+define float @test_f32_cst() #0 {
entry:
- %call = tail call float @llvm.fma.f32(float 3.0, float 3.0, float 3.0) nounwind readnone
+ %call = call float @llvm.fma.f32(float 3.0, float 3.0, float 3.0)
ret float %call
}
-; Test FMA3 variant selection
-; CHECK-FMA-INST: fma3_select231ssX:
-; CHECK-FMA-INST: vfmadd231ss %xmm
-define float @fma3_select231ssX(float %x, float %y) #0 {
-entry:
- br label %while.body
-while.body: ; preds = %while.body, %while.body
- %acc.01 = phi float [ 0.000000e+00, %entry ], [ %acc, %while.body ]
- %acc = tail call float @llvm.fma.f32(float %x, float %y, float %acc.01) nounwind readnone
- %b = fcmp ueq float %acc, 0.0
- br i1 %b, label %while.body, label %while.end
-while.end: ; preds = %while.body, %entry
- ret float %acc
-}
-
-; Test FMA3 variant selection
-; CHECK-FMA-INST: fma3_select231pdY:
-; CHECK-FMA-INST: vfmadd231pd %ymm
-define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 {
-entry:
- br label %while.body
-while.body: ; preds = %entry, %while.body
- %acc.04 = phi <4 x double> [ zeroinitializer, %entry ], [ %add, %while.body ]
- %add = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %acc.04)
- %vecext = extractelement <4 x double> %add, i32 0
- %cmp = fcmp oeq double %vecext, 0.000000e+00
- br i1 %cmp, label %while.body, label %while.end
-
-while.end: ; preds = %while.body
- ret <4 x double> %add
-}
-declare float @llvm.fma.f32(float, float, float) nounwind readnone
-declare double @llvm.fma.f64(double, double, double) nounwind readnone
-declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) nounwind readnone
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+declare float @llvm.fma.f32(float, float, float)
+declare double @llvm.fma.f64(double, double, double)
+declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80)
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
deleted file mode 100755
index fa9c252f30b46..0000000000000
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ /dev/null
@@ -1,150 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
-; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
-
-define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fmadd213ss (%r8), [[XMM1]], [[XMM0]]
- %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmadd213ps
- %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: fmadd213ps {{.*\(%r.*}}, %ymm
- %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
-
-define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fnmadd213ss (%r8), [[XMM1]], [[XMM0]]
- %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmadd213ps
- %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: fnmadd213ps {{.*\(%r.*}}, %ymm
- %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fmsub213ss (%r8), [[XMM1]], [[XMM0]]
- %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmsub213ps
- %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fnmsub213ss (%r8), [[XMM1]], [[XMM0]]
- %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmsub213ps
- %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-;;;;
-
-define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fmadd213sd (%r8), [[XMM1]], [[XMM0]]
- %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fmadd213pd
- %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fnmadd213sd (%r8), [[XMM1]], [[XMM0]]
- %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fnmadd213pd
- %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-
-
-define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fmsub213sd (%r8), [[XMM1]], [[XMM0]]
- %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fmsub213pd
- %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK-DAG: vmovaps (%rcx), [[XMM1:%xmm[0-9]+]]
- ; CHECK-DAG: vmovaps (%rdx), [[XMM0:%xmm[0-9]+]]
- ; CHECK: fnmsub213sd (%r8), [[XMM1]], [[XMM0]]
- %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fnmsub213pd
- %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/fold-load-binops.ll b/test/CodeGen/X86/fold-load-binops.ll
new file mode 100644
index 0000000000000..6d501c74fe57b
--- /dev/null
+++ b/test/CodeGen/X86/fold-load-binops.ll
@@ -0,0 +1,142 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+
+; Verify that we're folding the load into the math instruction.
+; This pattern is generated from the simplest intrinsics usage:
+; _mm_add_ss(a, _mm_load_ss(b));
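+; A minimal C sketch of that usage (hypothetical function name, illustration
+; only):
+;   __m128 add_from_mem(__m128 a, const float *b) {
+;     return _mm_add_ss(a, _mm_load_ss(b));
+;   }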
+
+define <4 x float> @addss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: addss:
+; SSE: # BB#0:
+; SSE-NEXT: addss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: addss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fadd float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @addsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: addsd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: addsd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fadd double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @subss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: subss:
+; SSE: # BB#0:
+; SSE-NEXT: subss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: subss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fsub float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @subsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: subsd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: subsd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fsub double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @mulss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: mulss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mulss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fmul float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: mulsd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mulsd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fmul double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
+
+define <4 x float> @divss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: divss:
+; SSE: # BB#0:
+; SSE-NEXT: divss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: divss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <4 x float> %va, i32 0
+ %b = load float, float* %pb
+ %r = fdiv float %a, %b
+ %vr = insertelement <4 x float> %va, float %r, i32 0
+ ret <4 x float> %vr
+}
+
+define <2 x double> @divsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: divsd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: divsd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = extractelement <2 x double> %va, i32 0
+ %b = load double, double* %pb
+ %r = fdiv double %a, %b
+ %vr = insertelement <2 x double> %va, double %r, i32 0
+ ret <2 x double> %vr
+}
diff --git a/test/CodeGen/X86/fold-vector-sext-crash2.ll b/test/CodeGen/X86/fold-vector-sext-crash2.ll
new file mode 100644
index 0000000000000..44c836195abc2
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-sext-crash2.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+
+; DAGCombiner crashes during sext folding
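+; Each test checks the second source element widened to i256: sext fills the
+; extra words with the sign (all-ones here), zext fills them with zero.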
+
+define <2 x i256> @test_sext1() {
+ %Se = sext <2 x i8> <i8 -100, i8 -99> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_sext1
+ ; X64: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-99
+
+ ; X32-LABEL: test_sext1
+ ; X32: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-99
+}
+
+define <2 x i256> @test_sext2() {
+ %Se = sext <2 x i128> <i128 -2000, i128 -1999> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_sext2
+ ; X64: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-1999
+
+ ; X32-LABEL: test_sext2
+ ; X32: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1999
+}
+
+define <2 x i256> @test_zext1() {
+ %Se = zext <2 x i8> <i8 -1, i8 -2> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_zext1
+ ; X64: movq $0
+ ; X64-NEXT: movq $0
+ ; X64-NEXT: movq $0
+ ; X64-NEXT: movq $254
+
+ ; X32-LABEL: test_zext1
+ ; X32: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $254
+}
+
+define <2 x i256> @test_zext2() {
+ %Se = zext <2 x i128> <i128 -1, i128 -2> to <2 x i256>
+ %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
+ ret <2 x i256> %Shuff
+
+ ; X64-LABEL: test_zext2
+ ; X64: movq $0
+ ; X64-NEXT: movq $0
+ ; X64-NEXT: movq $-1
+ ; X64-NEXT: movq $-2
+
+ ; X32-LABEL: test_zext2
+ ; X32: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $0
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-1
+ ; X32-NEXT: movl $-2
+}
diff --git a/test/CodeGen/X86/fold-vector-shl-crash.ll b/test/CodeGen/X86/fold-vector-shl-crash.ll
new file mode 100644
index 0000000000000..9f81e44074f1d
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-shl-crash.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+;CHECK-LABEL: test
+define <2 x i256> @test() {
+ %S = shufflevector <2 x i256> zeroinitializer, <2 x i256> <i256 -1, i256 -1>, <2 x i32> <i32 0, i32 2>
+ %B = shl <2 x i256> %S, <i256 -1, i256 -1> ; DAG Combiner crashes here
+ ret <2 x i256> %B
+}
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
index 4f503af716a80..27af5738ca3e8 100644
--- a/test/CodeGen/X86/fp-fast.ll
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -114,81 +114,3 @@ define float @test11(float %a) {
ret float %t2
}
-; Verify that the first two adds are independent regardless of how the inputs are
-; commuted. The destination registers are used as source registers for the third add.
-
-define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %t1, %x3
- ret float %t2
-}
-
-define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %x2, %t0
- %t2 = fadd float %t1, %x3
- ret float %t2
-}
-
-define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %x3, %t1
- ret float %t2
-}
-
-define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %x2, %t0
- %t2 = fadd float %x3, %t1
- ret float %t2
-}
-
-; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
-; produced because that would cost more compile time.
-
-define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK: # BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddss %xmm7, %xmm6, %xmm1
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %t0 = fadd float %x0, %x1
- %t1 = fadd float %t0, %x2
- %t2 = fadd float %t1, %x3
- %t3 = fadd float %t2, %x4
- %t4 = fadd float %t3, %x5
- %t5 = fadd float %t4, %x6
- %t6 = fadd float %t5, %x7
- ret float %t6
-}
diff --git a/test/CodeGen/X86/implicit-null-check-negative.ll b/test/CodeGen/X86/implicit-null-check-negative.ll
index e0210d9315f14..8fbed9f7bee85 100644
--- a/test/CodeGen/X86/implicit-null-check-negative.ll
+++ b/test/CodeGen/X86/implicit-null-check-negative.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -O3 -debug-only=faultmaps -enable-implicit-null-checks < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx -O3 -debug-only=faultmaps -enable-implicit-null-checks < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; List cases where we should *not* be emitting implicit null checks.
@@ -10,7 +10,7 @@ define i32 @imp_null_check_load(i32* %x, i32* %y) {
%c = icmp eq i32* %x, null
; It isn't legal to move the load from %x from "not_null" to here --
; the store to %y could be aliasing it.
- br i1 %c, label %is_null, label %not_null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
is_null:
ret i32 42
@@ -24,7 +24,7 @@ define i32 @imp_null_check_load(i32* %x, i32* %y) {
define i32 @imp_null_check_gep_load(i32* %x) {
entry:
%c = icmp eq i32* %x, null
- br i1 %c, label %is_null, label %not_null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
is_null:
ret i32 42
@@ -38,8 +38,7 @@ define i32 @imp_null_check_gep_load(i32* %x) {
}
define i32 @imp_null_check_load_no_md(i32* %x) {
-; Everything is okay except that the !never.executed metadata is
-; missing.
+; This is fine, except it is missing the !make.implicit metadata.
entry:
%c = icmp eq i32* %x, null
br i1 %c, label %is_null, label %not_null
@@ -51,3 +50,5 @@ define i32 @imp_null_check_load_no_md(i32* %x) {
%t = load i32, i32* %x
ret i32 %t
}
+
+!0 = !{}
diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll
index f4c539800fbbf..1d1b36bbd5d06 100644
--- a/test/CodeGen/X86/implicit-null-check.ll
+++ b/test/CodeGen/X86/implicit-null-check.ll
@@ -1,5 +1,15 @@
; RUN: llc -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-implicit-null-checks \
+; RUN: | llvm-mc -triple x86_64-apple-macosx -filetype=obj -o - \
+; RUN: | llvm-objdump -triple x86_64-apple-macosx -fault-map-section - \
+; RUN: | FileCheck %s -check-prefix OBJDUMP
+
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -enable-implicit-null-checks \
+; RUN: | llvm-mc -triple x86_64-unknown-linux-gnu -filetype=obj -o - \
+; RUN: | llvm-objdump -triple x86_64-unknown-linux-gnu -fault-map-section - \
+; RUN: | FileCheck %s -check-prefix OBJDUMP
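+; The OBJDUMP RUN lines round-trip the compiler output through llvm-mc and
+; check the encoded FaultMap section in the object file, on both Mach-O and
+; ELF.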
+
define i32 @imp_null_check_load(i32* %x) {
; CHECK-LABEL: _imp_null_check_load:
; CHECK: Ltmp1:
@@ -11,7 +21,7 @@ define i32 @imp_null_check_load(i32* %x) {
entry:
%c = icmp eq i32* %x, null
- br i1 %c, label %is_null, label %not_null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
is_null:
ret i32 42
@@ -32,7 +42,7 @@ define i32 @imp_null_check_gep_load(i32* %x) {
entry:
%c = icmp eq i32* %x, null
- br i1 %c, label %is_null, label %not_null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
is_null:
ret i32 42
@@ -55,7 +65,7 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
entry:
%c = icmp eq i32* %x, null
- br i1 %c, label %is_null, label %not_null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
is_null:
ret i32 42
@@ -66,6 +76,8 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
ret i32 %p1
}
+!0 = !{}
+
; CHECK-LABEL: __LLVM_FaultMaps:
; Version:
@@ -116,3 +128,13 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-NEXT: .long Ltmp1-_imp_null_check_load
; Fault[0].HandlerOffset:
; CHECK-NEXT: .long Ltmp0-_imp_null_check_load
+
+; OBJDUMP: FaultMap table:
+; OBJDUMP-NEXT: Version: 0x1
+; OBJDUMP-NEXT: NumFunctions: 3
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 5
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 3
diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll
new file mode 100644
index 0000000000000..d4cd59ffac3ac
--- /dev/null
+++ b/test/CodeGen/X86/machine-combiner.ll
@@ -0,0 +1,99 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+
+; Verify that the first two adds are independent regardless of how the inputs are
+; commuted. The destination registers are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %t1, %x3
+ ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %t1, %x3
+ ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
+; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
+; produced because that would cost more compile time.
+
+define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds5:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
+; CHECK-NEXT: vaddss %xmm6, %xmm1, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm7, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %t1, %x3
+ %t3 = fadd float %t2, %x4
+ %t4 = fadd float %t3, %x5
+ %t5 = fadd float %t4, %x6
+ %t6 = fadd float %t5, %x7
+ ret float %t6
+}
+
+; Verify that we only need two associative operations to reassociate the operands.
+; Also, we should reassociate such that the result of the high-latency division
+; is used by the final 'add' rather than reassociating the %x3 operand with the
+; division. The latter reassociation would not improve anything.
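+; That is, ((x0 / x1) + x2) + x3 is rewritten to (x0 / x1) + (x2 + x3), so
+; the x2 + x3 add can execute while the division is still in flight.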
+
+define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds6:
+; CHECK: # BB#0:
+; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %t0 = fdiv float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index f89e52457f355..b02f9ec45e7fb 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -2,11 +2,15 @@
; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
+%class.Class = type { i32 }
+%struct.s = type { i64 }
+
declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
+declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32 %c, i32 %d)
declare void @oneparam(i32 %a)
declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
-
+declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
; Here, we should have a reserved frame, so we don't expect pushes
; NORMAL-LABEL: test1:
@@ -108,13 +112,12 @@ entry:
ret void
}
-; We don't support weird calling conventions
+; We support unusual calling conventions (here, an argument passed inreg)
; NORMAL-LABEL: test4:
-; NORMAL: subl $12, %esp
-; NORMAL-NEXT: movl $4, 8(%esp)
-; NORMAL-NEXT: movl $3, 4(%esp)
-; NORMAL-NEXT: movl $1, (%esp)
-; NORMAL-NEXT: movl $2, %eax
+; NORMAL: movl $2, %eax
+; NORMAL-NEXT: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $12, %esp
define void @test4() optsize {
@@ -123,6 +126,20 @@ entry:
ret void
}
+; NORMAL-LABEL: test4b:
+; NORMAL: movl 4(%esp), %ecx
+; NORMAL-NEXT: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+; NORMAL-NEXT: ret
+define void @test4b(%class.Class* %f) optsize {
+entry:
+ call x86_thiscallcc void @thiscall(%class.Class* %f, i32 1, i32 2, i32 3, i32 4)
+ ret void
+}
+
; When there is no reserved call frame, check that additional alignment
; is added when the pushes don't add up to the required alignment.
; ALIGNED-LABEL: test5:
@@ -229,20 +246,27 @@ entry:
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $16, %esp
-; NORMAL-NEXT: subl $16, %esp
-; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]]
-; NORMAL-NEXT: movl [[EAX]], 12(%esp)
-; NORMAL-NEXT: movl $7, 8(%esp)
-; NORMAL-NEXT: movl $6, 4(%esp)
-; NORMAL-NEXT: movl $5, (%esp)
+; NORMAL-NEXT: subl $20, %esp
+; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]]
+; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]]
+; NORMAL-NEXT: movl [[E2]], 4(%esp)
+; NORMAL-NEXT: movl [[E1]], (%esp)
+; NORMAL-NEXT: leal 32(%esp), [[E3:%e..]]
+; NORMAL-NEXT: movl [[E3]], 16(%esp)
+; NORMAL-NEXT: leal 28(%esp), [[E4:%e..]]
+; NORMAL-NEXT: movl [[E4]], 12(%esp)
+; NORMAL-NEXT: movl $6, 8(%esp)
; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: addl $20, %esp
define void @test9() optsize {
entry:
%p = alloca i32, align 4
+ %q = alloca i32, align 4
+ %s = alloca %struct.s, align 4
call void @good(i32 1, i32 2, i32 3, i32 4)
- %0 = ptrtoint i32* %p to i32
- call void @good(i32 5, i32 6, i32 7, i32 %0)
+ %pv = ptrtoint i32* %p to i32
+ %qv = ptrtoint i32* %q to i32
+ call void @struct(%struct.s* byval %s, i32 6, i32 %qv, i32 %pv)
ret void
}
@@ -291,28 +315,17 @@ define void @test11() optsize {
; Converting one mov into a push isn't worth it when
; doing so forces too much overhead for other calls.
; NORMAL-LABEL: test12:
-; NORMAL: subl $16, %esp
-; NORMAL-NEXT: movl $4, 8(%esp)
-; NORMAL-NEXT: movl $3, 4(%esp)
-; NORMAL-NEXT: movl $1, (%esp)
-; NORMAL-NEXT: movl $2, %eax
-; NORMAL-NEXT: calll _inreg
-; NORMAL-NEXT: movl $8, 12(%esp)
+; NORMAL: movl $8, 12(%esp)
; NORMAL-NEXT: movl $7, 8(%esp)
; NORMAL-NEXT: movl $6, 4(%esp)
; NORMAL-NEXT: movl $5, (%esp)
; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: movl $12, 8(%esp)
-; NORMAL-NEXT: movl $11, 4(%esp)
-; NORMAL-NEXT: movl $9, (%esp)
-; NORMAL-NEXT: movl $10, %eax
-; NORMAL-NEXT: calll _inreg
-; NORMAL-NEXT: addl $16, %esp
define void @test12() optsize {
entry:
- call void @inreg(i32 1, i32 2, i32 3, i32 4)
+ %s = alloca %struct.s, align 4
+ call void @struct(%struct.s* %s, i32 2, i32 3, i32 4)
call void @good(i32 5, i32 6, i32 7, i32 8)
- call void @inreg(i32 9, i32 10, i32 11, i32 12)
+ call void @struct(%struct.s* %s, i32 10, i32 11, i32 12)
ret void
}
@@ -324,13 +337,12 @@ entry:
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: calll _good
; NORMAL-NEXT: addl $16, %esp
-; NORMAL-NEXT: subl $12, %esp
-; NORMAL-NEXT: movl $8, 8(%esp)
-; NORMAL-NEXT: movl $7, 4(%esp)
-; NORMAL-NEXT: movl $5, (%esp)
-; NORMAL-NEXT: movl $6, %eax
-; NORMAL-NEXT: calll _inreg
-; NORMAL-NEXT: addl $12, %esp
+; NORMAL-NEXT: subl $20, %esp
+; NORMAL: movl $8, 16(%esp)
+; NORMAL-NEXT: movl $7, 12(%esp)
+; NORMAL-NEXT: movl $6, 8(%esp)
+; NORMAL-NEXT: calll _struct
+; NORMAL-NEXT: addl $20, %esp
; NORMAL-NEXT: pushl $12
; NORMAL-NEXT: pushl $11
; NORMAL-NEXT: pushl $10
@@ -339,8 +351,9 @@ entry:
; NORMAL-NEXT: addl $16, %esp
define void @test12b() optsize {
entry:
- call void @good(i32 1, i32 2, i32 3, i32 4)
- call void @inreg(i32 5, i32 6, i32 7, i32 8)
+ %s = alloca %struct.s, align 4
+ call void @good(i32 1, i32 2, i32 3, i32 4)
+ call void @struct(%struct.s* %s, i32 6, i32 7, i32 8)
call void @good(i32 9, i32 10, i32 11, i32 12)
ret void
}
diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll
index ae3ed3f8344a6..9db948adb4652 100644
--- a/test/CodeGen/X86/or-branch.ll
+++ b/test/CodeGen/X86/or-branch.ll
@@ -1,19 +1,28 @@
-; RUN: llc < %s -march=x86 | not grep set
+; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2
+; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1
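+
+; With cheap jumps (-jump-is-expensive=0), each compare gets its own
+; conditional branch; with expensive jumps, both conditions are materialized
+; with setcc, or'd together, and tested with a single branch.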
define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind {
+; JUMP2-LABEL: foo:
+; JUMP2-DAG: jl
+; JUMP2-DAG: je
+;
+; JUMP1-LABEL: foo:
+; JUMP1-DAG: sete
+; JUMP1-DAG: setl
+; JUMP1: orb
+; JUMP1: jne
entry:
- %tmp = tail call i32 (...) @bar( ) ; <i32> [#uses=0]
- %tmp.upgrd.1 = icmp eq i32 %X, 0 ; <i1> [#uses=1]
- %tmp3 = icmp slt i32 %Y, 5 ; <i1> [#uses=1]
- %tmp4 = or i1 %tmp3, %tmp.upgrd.1 ; <i1> [#uses=1]
- br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock
+ %tmp1 = icmp eq i32 %X, 0
+ %tmp3 = icmp slt i32 %Y, 5
+ %tmp4 = or i1 %tmp3, %tmp1
+ br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock
-cond_true: ; preds = %entry
- %tmp5 = tail call i32 (...) @bar( ) ; <i32> [#uses=0]
- ret void
+cond_true:
+ %tmp5 = tail call i32 (...) @bar( )
+ ret void
-UnifiedReturnBlock: ; preds = %entry
- ret void
+UnifiedReturnBlock:
+ ret void
}
declare i32 @bar(...)
diff --git a/test/CodeGen/X86/pr23900.ll b/test/CodeGen/X86/pr23900.ll
new file mode 100644
index 0000000000000..cbc77161c0428
--- /dev/null
+++ b/test/CodeGen/X86/pr23900.ll
@@ -0,0 +1,29 @@
+; RUN: llc -filetype=obj %s -o %t.o
+; RUN: llvm-nm %t.o | FileCheck %s
+
+; Test that it doesn't crash (and produces an object file).
+; This used to pass a symbol with a null name to code that expected a valid
+; C string.
+
+; CHECK: U __CxxFrameHandler3
+; CHECK: T f
+; CHECK: t f.cleanup
+; CHECK: U g
+; CHECK: U h
+
+
+target triple = "x86_64-pc-windows-msvc18.0.0"
+define void @f(i32 %x) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+ invoke void @h()
+ to label %invoke.cont unwind label %lpad
+invoke.cont:
+ ret void
+lpad:
+ landingpad { i8*, i32 }
+ cleanup
+ call void @g(i32 %x)
+ ret void
+}
+declare void @h()
+declare i32 @__CxxFrameHandler3(...)
+declare void @g(i32 %x)
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 7f1521a83bcfd..8e02dad9d5aee 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 -recip=!divf,!vec-divf | FileCheck %s --check-prefix=NORECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE
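+; In the -recip syntax, a '!' prefix disables that estimate kind and a ':N'
+; suffix requests N refinement steps.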
@@ -14,11 +14,11 @@ define float @reciprocal_estimate(float %x) #0 {
%div = fdiv fast float 1.0, %x
ret float %div
-; CHECK-LABEL: reciprocal_estimate:
-; CHECK: movss
-; CHECK-NEXT: divss
-; CHECK-NEXT: movaps
-; CHECK-NEXT: retq
+; NORECIP-LABEL: reciprocal_estimate:
+; NORECIP: movss
+; NORECIP-NEXT: divss
+; NORECIP-NEXT: movaps
+; NORECIP-NEXT: retq
; RECIP-LABEL: reciprocal_estimate:
; RECIP: vrcpss
@@ -45,11 +45,11 @@ define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
-; CHECK-LABEL: reciprocal_estimate_v4f32:
-; CHECK: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: retq
+; NORECIP-LABEL: reciprocal_estimate_v4f32:
+; NORECIP: movaps
+; NORECIP-NEXT: divps
+; NORECIP-NEXT: movaps
+; NORECIP-NEXT: retq
; RECIP-LABEL: reciprocal_estimate_v4f32:
; RECIP: vrcpps
@@ -76,14 +76,14 @@ define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
-; CHECK-LABEL: reciprocal_estimate_v8f32:
-; CHECK: movaps
-; CHECK: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: divps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: retq
+; NORECIP-LABEL: reciprocal_estimate_v8f32:
+; NORECIP: movaps
+; NORECIP: movaps
+; NORECIP-NEXT: divps
+; NORECIP-NEXT: divps
+; NORECIP-NEXT: movaps
+; NORECIP-NEXT: movaps
+; NORECIP-NEXT: retq
; RECIP-LABEL: reciprocal_estimate_v8f32:
; RECIP: vrcpps
diff --git a/test/CodeGen/X86/rrlist-livereg-corrutpion.ll b/test/CodeGen/X86/rrlist-livereg-corrutpion.ll
new file mode 100644
index 0000000000000..7191e0453a668
--- /dev/null
+++ b/test/CodeGen/X86/rrlist-livereg-corrutpion.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+; CHECK-LABEL: test
+define i64 @test(i64 %a, i256 %b, i1 %c) {
+ %u = zext i64 %a to i256
+ %s = add i256 %u, 1
+ %o = trunc i256 %s to i1
+ %j = add i256 %s, 1
+ %i = icmp ule i64 %a, 1
+ %f = select i1 %o, i256 undef, i256 %j
+ %d = select i1 %i, i256 %f, i256 1
+ %e = add i256 %b, 1
+ %n = select i1 %c, i256 %e, i256 %b
+ %m = trunc i256 %n to i64
+ %h = add i64 %m, 1
+ %r = zext i64 %h to i256
+ %v = lshr i256 %d, %r
+ %t = trunc i256 %v to i1
+ %q = shl i256 1, %r
+ %p = and i256 %d, %q
+ %w = icmp ule i256 %n, 1
+ %y = select i1 %t, i256 undef, i256 %p
+ %x = select i1 %w, i256 %y, i256 %d
+ %z = trunc i256 %x to i64
+ ret i64 %z
+}
diff --git a/test/CodeGen/X86/sdiv-exact.ll b/test/CodeGen/X86/sdiv-exact.ll
index 4f8d3f05351b2..a6ace5bc31c1a 100644
--- a/test/CodeGen/X86/sdiv-exact.ll
+++ b/test/CodeGen/X86/sdiv-exact.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -march=x86 -mattr=+sse2 < %s | FileCheck %s
define i32 @test1(i32 %x) {
%div = sdiv exact i32 %x, 25
@@ -16,3 +16,14 @@ define i32 @test2(i32 %x) {
; CHECK-NEXT: imull $-1431655765
; CHECK-NEXT: ret
}
+
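+; Exact division by 24 (= 8 * 3) becomes an arithmetic shift right by 3
+; followed by a multiply with the inverse of 3 mod 2^32 (0xAAAAAAAB, i.e.
+; -1431655765). SSE2 has no 32-bit vector multiply, so in the vector case
+; that multiply is synthesized from two pmuludq.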
+define <4 x i32> @test3(<4 x i32> %x) {
+ %div = sdiv exact <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24>
+ ret <4 x i32> %div
+; CHECK-LABEL: test3:
+; CHECK: psrad $3,
+; CHECK: pmuludq
+; CHECK: pmuludq
+; CHECK-NOT: psrad
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/seh-catch-all-win32.ll b/test/CodeGen/X86/seh-catch-all-win32.ll
index 28b0bca962ea8..423b9914e99d2 100644
--- a/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.eh.typeid.for(i8*)
declare i8* @llvm.frameaddress(i32)
declare i8* @llvm.framerecover(i8*, i8*, i32)
declare void @llvm.frameescape(...)
-declare i8* @llvm.x86.seh.exceptioninfo(i8*, i8*)
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
define i32 @main() personality i8* bitcast (i32 (...)* @_except_handler3 to i8*) {
entry:
@@ -43,14 +43,16 @@ eh.resume: ; preds = %lpad
define internal i32 @"filt$main"() {
entry:
- %0 = tail call i8* @llvm.frameaddress(i32 1)
- %1 = tail call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %0, i32 0)
- %__exceptioncode = bitcast i8* %1 to i32*
- %2 = tail call i8* @llvm.x86.seh.exceptioninfo(i8* bitcast (i32 ()* @main to i8*), i8* %0)
- %3 = bitcast i8* %2 to i32**
- %4 = load i32*, i32** %3, align 4
- %5 = load i32, i32* %4, align 4
- store i32 %5, i32* %__exceptioncode, align 4
+ %ebp = tail call i8* @llvm.frameaddress(i32 1)
+ %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
+ %code.i8 = tail call i8* @llvm.framerecover(i8* bitcast (i32 ()* @main to i8*), i8* %parentfp, i32 0)
+ %__exceptioncode = bitcast i8* %code.i8 to i32*
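+ ; The EXCEPTION_POINTERS record lives at a fixed offset (-20) from the
+ ; parent EBP in this frame layout; chase the pointers down to the
+ ; exception code.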
+ %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %0 = bitcast i8* %info.addr to i32***
+ %1 = load i32**, i32*** %0, align 4
+ %2 = load i32*, i32** %1, align 4
+ %3 = load i32, i32* %2, align 4
+ store i32 %3, i32* %__exceptioncode, align 4
ret i32 1
}
@@ -76,10 +78,17 @@ entry:
; CHECK: calll _printf
; CHECK: .section .xdata,"dr"
+; CHECK: Lmain$parent_frame_offset = Lmain$frame_escape_1
; CHECK: L__ehtable$main
; CHECK-NEXT: .long -1
; CHECK-NEXT: .long _filt$main
; CHECK-NEXT: .long Ltmp{{[0-9]+}}
; CHECK-LABEL: _filt$main:
-; CHECK: movl
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: movl (%ebp), %[[oldebp:[a-z]+]]
+; CHECK: movl -20(%[[oldebp]]), %[[ehinfo:[a-z]+]]
+; CHECK: movl (%[[ehinfo]]), %[[ehrec:[a-z]+]]
+; CHECK: movl (%[[ehrec]]), %[[ehcode:[a-z]+]]
+; CHECK: movl %[[ehcode]], {{.*}}(%{{.*}})
diff --git a/test/CodeGen/X86/seh-filter-no-personality.ll b/test/CodeGen/X86/seh-filter-no-personality.ll
new file mode 100644
index 0000000000000..87bc9c93f4004
--- /dev/null
+++ b/test/CodeGen/X86/seh-filter-no-personality.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+
+; Mostly make sure that llvm.x86.seh.recoverfp doesn't crash if the parent
+; function lacks a personality.
+
+declare i8* @llvm.frameaddress(i32)
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
+
+define i32 @main() {
+entry:
+ ret i32 0
+}
+
+define internal i32 @"filt$main"() {
+entry:
+ %ebp = tail call i8* @llvm.frameaddress(i32 1)
+ %parentfp = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (i32 ()* @main to i8*), i8* %ebp)
+ %info.addr = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %0 = bitcast i8* %info.addr to i32***
+ %1 = load i32**, i32*** %0, align 4
+ %2 = load i32*, i32** %1, align 4
+ %3 = load i32, i32* %2, align 4
+ %matches = icmp eq i32 %3, u0xC0000005
+ %r = zext i1 %matches to i32
+ ret i32 %r
+}
+
+; CHECK: _main:
+; CHECK: xorl %eax, %eax
+; CHECK: retl
+
+; CHECK: _filt$main:
+; CHECK: retl
diff --git a/test/CodeGen/X86/seh-safe-div-win32.ll b/test/CodeGen/X86/seh-safe-div-win32.ll
index 0f76ec07a6b61..b1bcde2c7ff3b 100644
--- a/test/CodeGen/X86/seh-safe-div-win32.ll
+++ b/test/CodeGen/X86/seh-safe-div-win32.ll
@@ -122,27 +122,30 @@ entry:
; ...
; } EXCEPTION_RECORD;
-; FIXME: Use llvm.eh.exceptioninfo for this.
-declare i32 @safe_div_filt0()
-declare i32 @safe_div_filt1()
-; define i32 @safe_div_filt0() {
-; %eh_ptrs_c = bitcast i8* %eh_ptrs to i32**
-; %eh_rec = load i32*, i32** %eh_ptrs_c
-; %eh_code = load i32, i32* %eh_rec
-; ; EXCEPTION_ACCESS_VIOLATION = 0xC0000005
-; %cmp = icmp eq i32 %eh_code, 3221225477
-; %filt.res = zext i1 %cmp to i32
-; ret i32 %filt.res
-; }
-; define i32 @safe_div_filt1() {
-; %eh_ptrs_c = bitcast i8* %eh_ptrs to i32**
-; %eh_rec = load i32*, i32** %eh_ptrs_c
-; %eh_code = load i32, i32* %eh_rec
-; ; EXCEPTION_INT_DIVIDE_BY_ZERO = 0xC0000094
-; %cmp = icmp eq i32 %eh_code, 3221225620
-; %filt.res = zext i1 %cmp to i32
-; ret i32 %filt.res
-; }
+define i32 @safe_div_filt0() {
+ %ebp = call i8* @llvm.frameaddress(i32 1)
+ %eh_ptrs.addr.i8 = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %eh_ptrs.addr = bitcast i8* %eh_ptrs.addr.i8 to i32***
+ %eh_ptrs = load i32**, i32*** %eh_ptrs.addr
+ %eh_rec = load i32*, i32** %eh_ptrs
+ %eh_code = load i32, i32* %eh_rec
+ ; EXCEPTION_ACCESS_VIOLATION = 0xC0000005
+ %cmp = icmp eq i32 %eh_code, 3221225477
+ %filt.res = zext i1 %cmp to i32
+ ret i32 %filt.res
+}
+define i32 @safe_div_filt1() {
+ %ebp = call i8* @llvm.frameaddress(i32 1)
+ %eh_ptrs.addr.i8 = getelementptr inbounds i8, i8* %ebp, i32 -20
+ %eh_ptrs.addr = bitcast i8* %eh_ptrs.addr.i8 to i32***
+ %eh_ptrs = load i32**, i32*** %eh_ptrs.addr
+ %eh_rec = load i32*, i32** %eh_ptrs
+ %eh_code = load i32, i32* %eh_rec
+ ; EXCEPTION_INT_DIVIDE_BY_ZERO = 0xC0000094
+ %cmp = icmp eq i32 %eh_code, 3221225620
+ %filt.res = zext i1 %cmp to i32
+ ret i32 %filt.res
+}
@str_result = internal constant [21 x i8] c"safe_div result: %d\0A\00"
@@ -170,3 +173,4 @@ declare i32 @llvm.eh.typeid.for(i8*) readnone nounwind
declare void @puts(i8*)
declare void @printf(i8*, ...)
declare void @abort()
+declare i8* @llvm.frameaddress(i32)
diff --git a/test/CodeGen/X86/shift-combine.ll b/test/CodeGen/X86/shift-combine.ll
index ec62bcdcdba1b..43301041a0b69 100644
--- a/test/CodeGen/X86/shift-combine.ll
+++ b/test/CodeGen/X86/shift-combine.ll
@@ -17,3 +17,62 @@ entry:
ret i32 %tmp5
}
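+
+; An 'exact' right shift can fold into the implicit scale-by-4 of the i32
+; GEPs below: a shift by 3 folds to a single shift by 1, and a shift by 2
+; cancels completely.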
+define i32* @test_exact1(i32 %a, i32 %b, i32* %x) {
+; CHECK-LABEL: test_exact1:
+; CHECK: sarl %
+
+ %sub = sub i32 %b, %a
+ %shr = ashr exact i32 %sub, 3
+ %gep = getelementptr inbounds i32, i32* %x, i32 %shr
+ ret i32* %gep
+}
+
+define i32* @test_exact2(i32 %a, i32 %b, i32* %x) {
+; CHECK-LABEL: test_exact2:
+; CHECK: sarl %
+
+ %sub = sub i32 %b, %a
+ %shr = ashr exact i32 %sub, 3
+ %gep = getelementptr inbounds i32, i32* %x, i32 %shr
+ ret i32* %gep
+}
+
+define i32* @test_exact3(i32 %a, i32 %b, i32* %x) {
+; CHECK-LABEL: test_exact3:
+; CHECK-NOT: sarl
+
+ %sub = sub i32 %b, %a
+ %shr = ashr exact i32 %sub, 2
+ %gep = getelementptr inbounds i32, i32* %x, i32 %shr
+ ret i32* %gep
+}
+
+define i32* @test_exact4(i32 %a, i32 %b, i32* %x) {
+; CHECK-LABEL: test_exact4:
+; CHECK: shrl %
+
+ %sub = sub i32 %b, %a
+ %shr = lshr exact i32 %sub, 3
+ %gep = getelementptr inbounds i32, i32* %x, i32 %shr
+ ret i32* %gep
+}
+
+define i32* @test_exact5(i32 %a, i32 %b, i32* %x) {
+; CHECK-LABEL: test_exact5:
+; CHECK: shrl %
+
+ %sub = sub i32 %b, %a
+ %shr = lshr exact i32 %sub, 3
+ %gep = getelementptr inbounds i32, i32* %x, i32 %shr
+ ret i32* %gep
+}
+
+define i32* @test_exact6(i32 %a, i32 %b, i32* %x) {
+; CHECK-LABEL: test_exact6:
+; CHECK-NOT: shrl
+
+ %sub = sub i32 %b, %a
+ %shr = lshr exact i32 %sub, 2
+ %gep = getelementptr inbounds i32, i32* %x, i32 %shr
+ ret i32* %gep
+}
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 373fa53c970f8..0f8d9f4d713fa 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 -recip=!sqrtf,!vec-sqrtf,!divf,!vec-divf | FileCheck %s --check-prefix=NORECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE
declare double @__sqrt_finite(double) #0
@@ -10,10 +10,10 @@ declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
define double @fd(double %d) #0 {
-; CHECK-LABEL: fd:
-; CHECK: # BB#0:
-; CHECK-NEXT: sqrtsd %xmm0, %xmm0
-; CHECK-NEXT: retq
+; NORECIP-LABEL: fd:
+; NORECIP: # BB#0:
+; NORECIP-NEXT: sqrtsd %xmm0, %xmm0
+; NORECIP-NEXT: retq
;
; ESTIMATE-LABEL: fd:
; ESTIMATE: # BB#0:
@@ -25,10 +25,10 @@ define double @fd(double %d) #0 {
define float @ff(float %f) #0 {
-; CHECK-LABEL: ff:
-; CHECK: # BB#0:
-; CHECK-NEXT: sqrtss %xmm0, %xmm0
-; CHECK-NEXT: retq
+; NORECIP-LABEL: ff:
+; NORECIP: # BB#0:
+; NORECIP-NEXT: sqrtss %xmm0, %xmm0
+; NORECIP-NEXT: retq
;
; ESTIMATE-LABEL: ff:
; ESTIMATE: # BB#0:
@@ -49,11 +49,11 @@ define float @ff(float %f) #0 {
define x86_fp80 @fld(x86_fp80 %ld) #0 {
-; CHECK-LABEL: fld:
-; CHECK: # BB#0:
-; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
-; CHECK-NEXT: fsqrt
-; CHECK-NEXT: retq
+; NORECIP-LABEL: fld:
+; NORECIP: # BB#0:
+; NORECIP-NEXT: fldt {{[0-9]+}}(%rsp)
+; NORECIP-NEXT: fsqrt
+; NORECIP-NEXT: retq
;
; ESTIMATE-LABEL: fld:
; ESTIMATE: # BB#0:
@@ -67,12 +67,12 @@ define x86_fp80 @fld(x86_fp80 %ld) #0 {
define float @reciprocal_square_root(float %x) #0 {
-; CHECK-LABEL: reciprocal_square_root:
-; CHECK: # BB#0:
-; CHECK-NEXT: sqrtss %xmm0, %xmm1
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: divss %xmm1, %xmm0
-; CHECK-NEXT: retq
+; NORECIP-LABEL: reciprocal_square_root:
+; NORECIP: # BB#0:
+; NORECIP-NEXT: sqrtss %xmm0, %xmm1
+; NORECIP-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NORECIP-NEXT: divss %xmm1, %xmm0
+; NORECIP-NEXT: retq
;
; ESTIMATE-LABEL: reciprocal_square_root:
; ESTIMATE: # BB#0:
@@ -89,12 +89,12 @@ define float @reciprocal_square_root(float %x) #0 {
}
define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
-; CHECK-LABEL: reciprocal_square_root_v4f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: sqrtps %xmm0, %xmm1
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; CHECK-NEXT: divps %xmm1, %xmm0
-; CHECK-NEXT: retq
+; NORECIP-LABEL: reciprocal_square_root_v4f32:
+; NORECIP: # BB#0:
+; NORECIP-NEXT: sqrtps %xmm0, %xmm1
+; NORECIP-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; NORECIP-NEXT: divps %xmm1, %xmm0
+; NORECIP-NEXT: retq
;
; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
; ESTIMATE: # BB#0:
@@ -111,15 +111,15 @@ define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
}
define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
-; CHECK-LABEL: reciprocal_square_root_v8f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: sqrtps %xmm1, %xmm2
-; CHECK-NEXT: sqrtps %xmm0, %xmm3
-; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; CHECK-NEXT: movaps %xmm1, %xmm0
-; CHECK-NEXT: divps %xmm3, %xmm0
-; CHECK-NEXT: divps %xmm2, %xmm1
-; CHECK-NEXT: retq
+; NORECIP-LABEL: reciprocal_square_root_v8f32:
+; NORECIP: # BB#0:
+; NORECIP-NEXT: sqrtps %xmm1, %xmm2
+; NORECIP-NEXT: sqrtps %xmm0, %xmm3
+; NORECIP-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; NORECIP-NEXT: movaps %xmm1, %xmm0
+; NORECIP-NEXT: divps %xmm3, %xmm0
+; NORECIP-NEXT: divps %xmm2, %xmm1
+; NORECIP-NEXT: retq
;
; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
; ESTIMATE: # BB#0:
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 95f0c3d3a188b..63acf5f4f96f4 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -314,7 +314,13 @@ define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
-; TODO stack_fold_cvtsd2ss
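+; The inline asm "nop" clobbers every xmm register plus the flags, forcing
+; %a0 to be spilled; the CHECK verifies that the reload is folded straight
+; into the conversion.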
+define float @stack_fold_cvtsd2ss(double %a0) optsize {
+ ;CHECK-LABEL: stack_fold_cvtsd2ss
+ ;CHECK: cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fptrunc double %a0 to float
+ ret float %2
+}
define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) optsize {
;CHECK-LABEL: stack_fold_cvtsd2ss_int
diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll
index e930d244638a8..a164fbbc7a6ae 100644
--- a/test/CodeGen/X86/stack-folding-int-avx2.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -867,9 +867,21 @@ define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
ret <8 x i32> %2
}
-; TODO stack_fold_pshufhw
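+; The asm clobber below forces a 32-byte spill of %a0; the $27 immediate
+; encodes the 3,2,1,0 word reversal within each half of the vector.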
+define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_vpshufhw
+ ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i16> %2
+}
-; TODO stack_fold_pshuflw
+define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) {
+ ;CHECK-LABEL: stack_fold_vpshuflw
+ ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %2
+}
define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_psignb
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
index 6bb0d8980e5bd..e18476cee53c5 100644
--- a/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -1,10 +1,11 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
+; RUN: llc < %s -mtriple="x86_64-pc-win64-coff" | FileCheck %s
+
; This test is a sanity check to ensure statepoints are generating StackMap
; sections correctly. This is not intended to be a rigorous test of the
; StackMap format (see the stackmap tests for that).
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux-gnu"
declare zeroext i1 @return_i1()
diff --git a/test/CodeGen/X86/system-intrinsics-64.ll b/test/CodeGen/X86/system-intrinsics-64.ll
new file mode 100644
index 0000000000000..96c4417733902
--- /dev/null
+++ b/test/CodeGen/X86/system-intrinsics-64.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
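+; fxsave/fxrstor save and restore the x87, MMX, and SSE state through a
+; 512-byte memory area; fxsave64/fxrstor64 are the REX.W variants of the
+; same instructions.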
+
+define void @test_fxsave(i8* %ptr) {
+; CHECK-LABEL: test_fxsave
+; CHECK: fxsave
+ call void @llvm.x86.fxsave(i8* %ptr)
+ ret void
+}
+declare void @llvm.x86.fxsave(i8*)
+
+define void @test_fxsave64(i8* %ptr) {
+; CHECK-LABEL: test_fxsave64
+; CHECK: fxsave64
+ call void @llvm.x86.fxsave64(i8* %ptr)
+ ret void
+}
+declare void @llvm.x86.fxsave64(i8*)
+
+define void @test_fxrstor(i8* %ptr) {
+; CHECK-LABEL: test_fxrstor
+; CHECK: fxrstor
+ call void @llvm.x86.fxrstor(i8* %ptr)
+ ret void
+}
+declare void @llvm.x86.fxrstor(i8*)
+
+define void @test_fxrstor64(i8* %ptr) {
+; CHECK-LABEL: test_fxrstor64
+; CHECK: fxrstor64
+ call void @llvm.x86.fxrstor64(i8* %ptr)
+ ret void
+}
+declare void @llvm.x86.fxrstor64(i8*)
diff --git a/test/CodeGen/X86/system-intrinsics.ll b/test/CodeGen/X86/system-intrinsics.ll
new file mode 100644
index 0000000000000..84fcd052d7dbf
--- /dev/null
+++ b/test/CodeGen/X86/system-intrinsics.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s
+
+define void @test_fxsave(i8* %ptr) {
+; CHECK-LABEL: test_fxsave
+; CHECK: fxsave
+ call void @llvm.x86.fxsave(i8* %ptr)
+ ret void
+}
+declare void @llvm.x86.fxsave(i8*)
+
+define void @test_fxrstor(i8* %ptr) {
+; CHECK-LABEL: test_fxrstor
+; CHECK: fxrstor
+ call void @llvm.x86.fxrstor(i8* %ptr)
+ ret void
+}
+declare void @llvm.x86.fxrstor(i8*)
diff --git a/test/CodeGen/X86/twoaddr-lea.ll b/test/CodeGen/X86/twoaddr-lea.ll
index b5ca0275d8d69..5779cf33ac84c 100644
--- a/test/CodeGen/X86/twoaddr-lea.ll
+++ b/test/CodeGen/X86/twoaddr-lea.ll
@@ -25,8 +25,7 @@ define i32 @test2(i32 inreg %a, i32 inreg %b, i32 %c, i32 %d) nounwind {
entry:
; CHECK-LABEL: test2:
; CHECK: leal
-; CHECK-NOT: leal
-; CHECK-NOT: mov
+; CHECK-NEXT: addl
; CHECK-NEXT: addl
; CHECK-NEXT: ret
%add = add i32 %b, %a
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 8dded07af7d4d..ca8be65075b90 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -50,31 +50,15 @@ define <2 x double> @sitofp_2vf64_i32(<4 x i32> %a) {
define <2 x double> @sitofp_2vf64_i16(<8 x i16> %a) {
; SSE2-LABEL: sitofp_2vf64_i16:
; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: movswq %ax, %rax
-; SSE2-NEXT: movd %xmm0, %rcx
-; SSE2-NEXT: movswq %cx, %rcx
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: sitofp_2vf64_i16:
; AVX: # BB#0:
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: movswq %ax, %rax
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movswq %cx, %rcx
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0
-; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%cvt = sitofp <2 x i16> %shuf to <2 x double>
@@ -86,30 +70,14 @@ define <2 x double> @sitofp_2vf64_i8(<16 x i8> %a) {
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: movsbq %al, %rax
-; SSE2-NEXT: movd %xmm0, %rcx
-; SSE2-NEXT: movsbq %cl, %rcx
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: sitofp_2vf64_i8:
; AVX: # BB#0:
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: movsbq %al, %rax
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movsbq %cl, %rcx
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0
-; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%cvt = sitofp <2 x i8> %shuf to <2 x double>
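Both hunks above replace lane-by-lane GPR conversions (extract, movswq/movsbq, cvtsi2sdq, repack) with an in-vector sign extension (punpck + psrad on SSE2, vpmovsx on AVX) followed by a single cvtdq2pd. The IR shape the new lowering effectively implements is sketched below (hypothetical function name, not part of the patch):

    define <2 x double> @sitofp_2vf64_via_i32(<8 x i16> %a) {
      %lo = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
      %ext = sext <2 x i16> %lo to <2 x i32>       ; done in-vector by punpcklwd + psrad
      %cvt = sitofp <2 x i32> %ext to <2 x double> ; one cvtdq2pd
      ret <2 x double> %cvt
    }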
diff --git a/test/CodeGen/X86/vec_shift8.ll b/test/CodeGen/X86/vec_shift8.ll
deleted file mode 100644
index 9d19f667ea9b2..0000000000000
--- a/test/CodeGen/X86/vec_shift8.ll
+++ /dev/null
@@ -1,527 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-
-;
-; Vectorized integer shifts
-;
-
-define <2 x i64> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp {
-entry:
-; ALL-NOT: shll
-;
-; SSE2: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psllw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psllw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psllw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX: vpsllw $12, %xmm1, %xmm2
-; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $1, %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %shl = shl <8 x i16> %r, %a
- %tmp2 = bitcast <8 x i16> %shl to <2 x i64>
- ret <2 x i64> %tmp2
-}
-
-define <2 x i64> @shl_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
-entry:
-; SSE2: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm3, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
- %shl = shl <16 x i8> %r, %a
- %tmp2 = bitcast <16 x i8> %shl to <2 x i64>
- ret <2 x i64> %tmp2
-}
-
-define <2 x i64> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp {
-entry:
-; ALL-NOT: sarw
-;
-; SSE2: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psraw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX: vpsllw $12, %xmm1, %xmm2
-; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpsraw $8, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %ashr = ashr <8 x i16> %r, %a
- %tmp2 = bitcast <8 x i16> %ashr to <2 x i64>
- ret <2 x i64> %tmp2
-}
-
-define <2 x i64> @ashr_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
-entry:
-; ALL-NOT: sarb
-;
-; SSE2: punpckhbw {{.*#}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $4, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $2, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: punpckhbw {{.*#}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: punpcklbw {{.*#}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpunpckhbw {{.*#}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX-NEXT: vpunpckhbw {{.*#}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpunpcklbw {{.*#}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
- %ashr = ashr <16 x i8> %r, %a
- %tmp2 = bitcast <16 x i8> %ashr to <2 x i64>
- ret <2 x i64> %tmp2
-}
-
-define <2 x i64> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp {
-entry:
-; ALL-NOT: shrl
-;
-; SSE2: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $4, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX: vpsllw $12, %xmm1, %xmm2
-; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %lshr = lshr <8 x i16> %r, %a
- %tmp2 = bitcast <8 x i16> %lshr to <2 x i64>
- ret <2 x i64> %tmp2
-}
-
-define <2 x i64> @lshr_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
-entry:
-; ALL-NOT: shrb
-;
-; SSE2: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
- %lshr = lshr <16 x i8> %r, %a
- %tmp2 = bitcast <16 x i8> %lshr to <2 x i64>
- ret <2 x i64> %tmp2
-}
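The hand-written checks deleted above are superseded by the vector-shift-{shl,lshr,ashr}-{128,256}.ll files added below, which cover the same shl/lshr/ashr patterns plus splat-variable, constant, and splat-constant variants. Each new file layers FileCheck prefixes per RUN line, so a check tagged SSE applies to both the SSE2 and SSE4.1 runs while SSE2/SSE41 checks stay run-specific, for example:

    ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
    ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41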
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index e6acc7efaf394..aafc05b2ed4ce 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -117,6 +117,46 @@ entry:
ret <4 x i64>%B
}
+define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_2i8_to_i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_2i8_to_i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: psraw $8, %xmm0
+; SSSE3-NEXT: movd %xmm0, %eax
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_2i8_to_i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: sext_2i8_to_i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_2i8_to_i32:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: pmovsxbw %xmm0, %xmm0
+; X32-SSE41-NEXT: movd %xmm0, %eax
+; X32-SSE41-NEXT: popl %edx
+; X32-SSE41-NEXT: retl
+entry:
+ %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %Ex = sext <2 x i8> %Shuf to <2 x i16>
+ %Bc = bitcast <2 x i16> %Ex to i32
+ ret i32 %Bc
+}
+
define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_test1:
; SSE2: # BB#0: # %entry
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
new file mode 100644
index 0000000000000..4fd2f8b51b8b2
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -0,0 +1,1041 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Variable Shifts
+;
+
+define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: var_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movd %xmm1, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: pextrq $1, %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movd %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: sarq %cl, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: sarq %cl, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX-NEXT: retq
+ %shift = ashr <2 x i64> %a, %b
+ ret <2 x i64> %shift
+}
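x86 has no packed arithmetic right shift for 64-bit elements at these feature levels (vpsravq only arrives with AVX-512), so even the AVX2 run scalarizes through sarq. The lowering is equivalent to this per-lane IR (a sketch, not part of the patch):

    define <2 x i64> @ashr_v2i64_per_lane(<2 x i64> %a, <2 x i64> %b) {
      %a0 = extractelement <2 x i64> %a, i32 0
      %b0 = extractelement <2 x i64> %b, i32 0
      %r0 = ashr i64 %a0, %b0
      %a1 = extractelement <2 x i64> %a, i32 1
      %b1 = extractelement <2 x i64> %b, i32 1
      %r1 = ashr i64 %a1, %b1
      %v0 = insertelement <2 x i64> undef, i64 %r0, i32 0
      %v1 = insertelement <2 x i64> %v0, i64 %r1, i32 1
      ret <2 x i64> %v1
    }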
+
+define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: var_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: sarl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSE2-NEXT: movd %xmm3, %ecx
+; SSE2-NEXT: sarl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: sarl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %ecx
+; SSE2-NEXT: sarl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $1, %xmm0, %eax
+; SSE41-NEXT: pextrd $1, %xmm1, %ecx
+; SSE41-NEXT: sarl %cl, %eax
+; SSE41-NEXT: movd %xmm0, %edx
+; SSE41-NEXT: movd %xmm1, %ecx
+; SSE41-NEXT: sarl %cl, %edx
+; SSE41-NEXT: movd %edx, %xmm2
+; SSE41-NEXT: pinsrd $1, %eax, %xmm2
+; SSE41-NEXT: pextrd $2, %xmm0, %eax
+; SSE41-NEXT: pextrd $2, %xmm1, %ecx
+; SSE41-NEXT: sarl %cl, %eax
+; SSE41-NEXT: pinsrd $2, %eax, %xmm2
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: pextrd $3, %xmm1, %ecx
+; SSE41-NEXT: sarl %cl, %eax
+; SSE41-NEXT: pinsrd $3, %eax, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %edx
+; AVX1-NEXT: vmovd %edx, %xmm2
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = ashr <4 x i32> %a, %b
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: var_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psraw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %shift = ashr <8 x i16> %a, %b
+ ret <8 x i16> %shift
+}
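AVX2 has variable dword/qword shifts but no variable word shift (vpsravw is AVX-512BW), so the AVX2 path above widens to 32-bit lanes, shifts with vpsravd, and narrows back via vpshufb + vpermq. In IR terms the strategy is (sketch, hypothetical name):

    define <8 x i16> @ashr_v8i16_via_i32(<8 x i16> %a, <8 x i16> %b) {
      %ax = sext <8 x i16> %a to <8 x i32>   ; vpmovsxwd
      %bx = zext <8 x i16> %b to <8 x i32>   ; vpmovzxwd
      %sh = ashr <8 x i32> %ax, %bx          ; vpsravd
      %r  = trunc <8 x i32> %sh to <8 x i16> ; vpshufb + vpermq
      ret <8 x i16> %r
    }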
+
+define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: var_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = ashr <16 x i8> %a, %b
+ ret <16 x i8> %shift
+}
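The byte case has no hardware shift at all, so all three paths above run a bit-serial loop over the 3-bit shift amount: psllw $5 moves the most significant amount bit into each byte's sign bit, a signed compare against zero (pcmpgtb, or pcmpgtw on the unpacked halves) expands that sign bit into a full lane mask, and a blend conditionally applies a shift by 4; doubling the amounts (paddb/paddw) then exposes the next bit for the shift-by-2 and shift-by-1 steps. One such conditional step, expressed as IR (a sketch only):

    define <16 x i8> @ashr_step_by_4(<16 x i8> %x, <16 x i1> %m) {
      %sh = ashr <16 x i8> %x, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
      %r = select <16 x i1> %m, <16 x i8> %sh, <16 x i8> %x  ; the pblendvb (or pand/pandn/por) step
      ret <16 x i8> %r
    }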
+
+;
+; Uniform Variable Shifts
+;
+
+define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: splatvar_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movd %xmm2, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: sarq %cl, %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: pextrq $1, %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm2
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: movd %xmm1, %rcx
+; SSE41-NEXT: sarq %cl, %rax
+; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: splatvar_shift_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: retq
+ %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
+ %shift = ashr <2 x i64> %a, %splat
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: splatvar_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: psrad %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; SSE41-NEXT: psrad %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+ %shift = ashr <4 x i32> %a, %splat
+ ret <4 x i32> %shift
+}
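psrad with a register count reads the shift amount from the low 64 bits of the count register, which is why the splat is reduced to a single scalar count with everything above element 0 zeroed (movss from a zeroed register on SSE2, pblendw against zero on SSE4.1/AVX) rather than shifted per lane. The splat IR being matched (sketch, hypothetical name):

    define <4 x i32> @ashr_v4i32_by_scalar(<4 x i32> %a, i32 %n) {
      %v = insertelement <4 x i32> undef, i32 %n, i32 0
      %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
      %r = ashr <4 x i32> %a, %s
      ret <4 x i32> %r
    }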
+
+define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: splatvar_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psraw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: psraw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
+ %shift = ashr <8 x i16> %a, %splat
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: splatvar_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: splatvar_shift_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX2-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
+ %shift = ashr <16 x i8> %a, %splat
+ ret <16 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
+define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
+; SSE2-LABEL: constant_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq $7, %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: sarq $7, %rax
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: sarq %rax
+; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: constant_shift_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: sarq $7, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: sarq %rax
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %shift = ashr <2 x i64> %a, <i64 1, i64 7>
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
+; SSE2-LABEL: constant_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: sarl $7, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: sarl $5, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: sarl $4, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: sarl $6, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $1, %xmm0, %eax
+; SSE41-NEXT: sarl $5, %eax
+; SSE41-NEXT: movd %xmm0, %ecx
+; SSE41-NEXT: sarl $4, %ecx
+; SSE41-NEXT: movd %ecx, %xmm1
+; SSE41-NEXT: pinsrd $1, %eax, %xmm1
+; SSE41-NEXT: pextrd $2, %xmm0, %eax
+; SSE41-NEXT: sarl $6, %eax
+; SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: sarl $7, %eax
+; SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: sarl $5, %eax
+; AVX1-NEXT: vmovd %xmm0, %ecx
+; AVX1-NEXT: sarl $4, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: sarl $6, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: sarl $7, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
+; SSE2-LABEL: constant_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $8, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
+; SSE2-LABEL: constant_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm3
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: constant_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+ ret <16 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
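+; With a splatted constant amount, i32/i16 can use the immediate forms
+; (psrad/psraw). v2i64 has no psraq before AVX-512, so each lane is shifted
+; with scalar sarq. v16i8 has no byte shift at all; it is emulated as a
+; logical shift plus sign fixup: for a shift by 3, t = x >>u 3 (psrlw + mask),
+; then (t ^ 16) - 16 restores the sign bits, since 16 == 0x80 >> 3.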
+define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
+; SSE2-LABEL: splatconstant_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq $7, %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: sarq $7, %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatconstant_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: sarq $7, %rax
+; SSE41-NEXT: movd %rax, %xmm1
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: sarq $7, %rax
+; SSE41-NEXT: movd %rax, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: sarq $7, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: sarq $7, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %shift = ashr <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) {
+; SSE-LABEL: splatconstant_shift_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: psrad $5, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrad $5, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
+; SSE-LABEL: splatconstant_shift_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psraw $3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
+; SSE-LABEL: splatconstant_shift_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
new file mode 100644
index 0000000000000..3fc377af56500
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -0,0 +1,767 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Variable Shifts
+;
+
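+; AVX1 has no 256-bit integer shifts, so vectors are split into xmm halves:
+; i64 and i32 lanes are scalarized through sarq/sarl, while i16 and i8 use a
+; conditional-shift ladder selected by vpblendvb, which in effect computes,
+; per i8 lane:
+;   if (c & 4) x >>= 4; if (c & 2) x >>= 2; if (c & 1) x >>= 1;
+; AVX2 provides vpsravd for i32 and widens i16 to i32 to reuse it, but there
+; is still no vector arithmetic shift for i64, so that case stays scalar.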
+define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: var_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: vmovq %xmm3, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm4
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm3, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <4 x i64> %a, %b
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: var_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrd $1, %xmm2, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vmovd %xmm2, %edx
+; AVX1-NEXT: vmovd %xmm3, %ecx
+; AVX1-NEXT: sarl %cl, %edx
+; AVX1-NEXT: vmovd %edx, %xmm4
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
+; AVX1-NEXT: vpextrd $2, %xmm2, %eax
+; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX1-NEXT: vpextrd $3, %xmm2, %eax
+; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %edx
+; AVX1-NEXT: vmovd %edx, %xmm3
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX1-NEXT: sarl %cl, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <8 x i32> %a, %b
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: var_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
+; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <16 x i16> %a, %b
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: var_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
+; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <32 x i8> %a, %b
+ ret <32 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
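+; A splatted amount lets i32 and i16 use the count-in-xmm forms vpsrad and
+; vpsraw, with the low element zeroed out or zero-extended first; i64 and i8
+; have no such form and fall back to the scalarized / blend-ladder expansions.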
+define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: splatvar_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: movb %al, %cl
+; AVX1-NEXT: sarq %cl, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm3
+; AVX1-NEXT: vmovq %xmm2, %rsi
+; AVX1-NEXT: vmovq %xmm1, %rdx
+; AVX1-NEXT: movb %dl, %cl
+; AVX1-NEXT: sarq %cl, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT: movb %al, %cl
+; AVX1-NEXT: sarq %cl, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movb %dl, %cl
+; AVX1-NEXT: sarq %cl, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm4
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm3, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: sarq %cl, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
+ %shift = ashr <4 x i64> %a, %splat
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: splatvar_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ %shift = ashr <8 x i32> %a, %splat
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: splatvar_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
+ %shift = ashr <16 x i16> %a, %splat
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: splatvar_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm9
+; AVX1-NEXT: vpblendvb %xmm9, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm8
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; AVX1-NEXT: vpsraw $4, %xmm3, %xmm5
+; AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $2, %xmm3, %xmm5
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm5
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm7
+; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsraw $4, %xmm5, %xmm3
+; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm5, %xmm2
+; AVX1-NEXT: vpsraw $2, %xmm2, %xmm3
+; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $1, %xmm2, %xmm3
+; AVX1-NEXT: vpblendvb %xmm9, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+ %shift = ashr <32 x i8> %a, %splat
+ ret <32 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
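+; Constant per-lane amounts: AVX2 folds the i32 amounts into a memory operand
+; for vpsravd; the i16/i8 blend masks become constant-pool loads; i64 is still
+; extracted and shifted with sarq lane by lane.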
+define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
+; AVX1-LABEL: constant_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: sarq $62, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: sarq $31, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: sarq $7, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: sarq %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: sarq $62, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: sarq $31, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: sarq $7, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: sarq %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
+; AVX1-LABEL: constant_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; AVX1-NEXT: sarl $9, %eax
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: sarl $8, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $2, %xmm1, %eax
+; AVX1-NEXT: sarl $8, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; AVX1-NEXT: sarl $7, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: sarl $5, %eax
+; AVX1-NEXT: vmovd %xmm0, %ecx
+; AVX1-NEXT: sarl $4, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: sarl $6, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: sarl $7, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
+; AVX1-LABEL: constant_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
+; AVX1-LABEL: constant_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
+; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm9
+; AVX1-NEXT: vpblendvb %xmm9, %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm8
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; AVX1-NEXT: vpsraw $4, %xmm3, %xmm5
+; AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $2, %xmm3, %xmm5
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm5
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm7
+; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsraw $4, %xmm5, %xmm3
+; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm5, %xmm2
+; AVX1-NEXT: vpsraw $2, %xmm2, %xmm3
+; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $1, %xmm2, %xmm3
+; AVX1-NEXT: vpblendvb %xmm9, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+ ret <32 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
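+; Same pattern as the 128-bit tests above: immediate vpsrad/vpsraw for
+; i32/i16 (per half on AVX1), scalar sarq for i64, and for i8 a logical
+; shift plus the (t ^ 16) - 16 sign fixup, since 16 == 0x80 >> 3.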
+define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
+; AVX1-LABEL: splatconstant_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: sarq $7, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: sarq $7, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: sarq $7, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: sarq $7, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: sarq $7, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: sarq $7, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: sarq $7, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: sarq $7, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
+; AVX1-LABEL: splatconstant_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
+; AVX1-LABEL: splatconstant_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
+; AVX1-LABEL: splatconstant_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <32 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
new file mode 100644
index 0000000000000..f5a7e28383fe5
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -0,0 +1,778 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Variable Shifts
+;
+
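+; Logical right shifts: SSE2 handles v2i64 with two psrlq ops (one per
+; quadword count) blended together, scalarizes i32 through shrl, and expands
+; i16/i8 with a sign-bit select ladder (psraw $15 masks / pblendvb), in
+; effect per lane:
+;   if (c & 4) x >>= 4; if (c & 2) x >>= 2; if (c & 1) x >>= 1;
+; AVX2 covers i64/i32 directly with vpsrlvq/vpsrlvd.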
+define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: var_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrlq %xmm3, %xmm2
+; SSE2-NEXT: psrlq %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: psrlq %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = lshr <2 x i64> %a, %b
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: var_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: shrl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSE2-NEXT: movd %xmm3, %ecx
+; SSE2-NEXT: shrl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: shrl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %ecx
+; SSE2-NEXT: shrl %cl, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $1, %xmm0, %eax
+; SSE41-NEXT: pextrd $1, %xmm1, %ecx
+; SSE41-NEXT: shrl %cl, %eax
+; SSE41-NEXT: movd %xmm0, %edx
+; SSE41-NEXT: movd %xmm1, %ecx
+; SSE41-NEXT: shrl %cl, %edx
+; SSE41-NEXT: movd %edx, %xmm2
+; SSE41-NEXT: pinsrd $1, %eax, %xmm2
+; SSE41-NEXT: pextrd $2, %xmm0, %eax
+; SSE41-NEXT: pextrd $2, %xmm1, %ecx
+; SSE41-NEXT: shrl %cl, %eax
+; SSE41-NEXT: pinsrd $2, %eax, %xmm2
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: pextrd $3, %xmm1, %ecx
+; SSE41-NEXT: shrl %cl, %eax
+; SSE41-NEXT: pinsrd $3, %eax, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %edx
+; AVX1-NEXT: vmovd %edx, %xmm2
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = lshr <4 x i32> %a, %b
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: var_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %shift = lshr <8 x i16> %a, %b
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: var_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = lshr <16 x i8> %a, %b
+ ret <16 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
+
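+; With a splatted amount the count-in-xmm forms apply directly: psrlq takes
+; the count from the low quadword as-is, while psrld/psrlw first need the low
+; element isolated or zero-extended (movss/pblendw against zero, or
+; movd+movzwl); i8 still has no hardware form and keeps the blend ladder.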
+define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: splatvar_shift_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
+ %shift = lshr <2 x i64> %a, %splat
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: splatvar_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: psrld %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; SSE41-NEXT: psrld %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+ %shift = lshr <4 x i32> %a, %splat
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: splatvar_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: psrlw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
+ %shift = lshr <8 x i16> %a, %splat
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: splatvar_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm4
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: splatvar_shift_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
+ %shift = lshr <16 x i8> %a, %splat
+ ret <16 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
+
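+; Constant amounts: v2i64 splits into two immediate psrlq ops blended
+; together, v4i32 scalarizes on SSE (shrl by each constant) but folds into a
+; vpsrlvd memory operand on AVX2, and v8i16/v16i8 keep the blend ladder with
+; precomputed constant masks.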
+define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
+; SSE2-LABEL: constant_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlq $7, %xmm1
+; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlq $7, %xmm1
+; SSE41-NEXT: psrlq $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = lshr <2 x i64> %a, <i64 1, i64 7>
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
+; SSE2-LABEL: constant_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: shrl $7, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: shrl $5, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: shrl $4, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: shrl $6, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $1, %xmm0, %eax
+; SSE41-NEXT: shrl $5, %eax
+; SSE41-NEXT: movd %xmm0, %ecx
+; SSE41-NEXT: shrl $4, %ecx
+; SSE41-NEXT: movd %ecx, %xmm1
+; SSE41-NEXT: pinsrd $1, %eax, %xmm1
+; SSE41-NEXT: pextrd $2, %xmm0, %eax
+; SSE41-NEXT: shrl $6, %eax
+; SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: shrl $7, %eax
+; SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: vmovd %xmm0, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
+; SSE2-LABEL: constant_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm2 # xmm2 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
+; SSE2-LABEL: constant_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: constant_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+ ret <16 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
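+; A splatted constant amount maps directly onto the immediate forms psrlq,
+; psrld and psrlw. There is no per-byte shift instruction, so v16i8 is
+; shifted as words and then masked with pand to clear the bits pulled in
+; from the neighboring byte.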
+
+define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
+; SSE-LABEL: splatconstant_shift_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $7, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = lshr <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) {
+; SSE-LABEL: splatconstant_shift_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: psrld $5, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
+; SSE-LABEL: splatconstant_shift_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
+; SSE-LABEL: splatconstant_shift_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
new file mode 100644
index 0000000000000..d200abd5f8755
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -0,0 +1,548 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Variable Shifts
+;
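+; AVX1 has no 256-bit integer shifts, so each ymm operation is split with
+; vextractf128 into two xmm halves that are shifted separately and rejoined
+; with vinsertf128; AVX2 lowers i64/i32 directly to vpsrlvq/vpsrlvd, widens
+; i16 to i32 for vpsrlvd before repacking, and uses the vpblendvb ladder
+; for bytes.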
+
+define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: var_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <4 x i64> %a, %b
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: var_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrd $1, %xmm2, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vmovd %xmm2, %edx
+; AVX1-NEXT: vmovd %xmm3, %ecx
+; AVX1-NEXT: shrl %cl, %edx
+; AVX1-NEXT: vmovd %edx, %xmm4
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
+; AVX1-NEXT: vpextrd $2, %xmm2, %eax
+; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX1-NEXT: vpextrd $3, %xmm2, %eax
+; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %edx
+; AVX1-NEXT: vmovd %edx, %xmm3
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX1-NEXT: shrl %cl, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <8 x i32> %a, %b
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: var_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
+; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <16 x i16> %a, %b
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: var_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <32 x i8> %a, %b
+ ret <32 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
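+; When the shift amount is splatted, the legacy shift-by-scalar forms apply:
+; the amount is zero-extended into the low element of an xmm register and a
+; single vpsrlq/vpsrld/vpsrlw shifts the whole vector. v32i8 still needs the
+; vpblendvb ladder since no per-byte shift exists.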
+
+define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: splatvar_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
+ %shift = lshr <4 x i64> %a, %splat
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: splatvar_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ %shift = lshr <8 x i32> %a, %splat
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: splatvar_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
+ %shift = lshr <16 x i16> %a, %splat
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: splatvar_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+ %shift = lshr <32 x i8> %a, %splat
+ ret <32 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
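+; Constant per-element amounts on AVX1 reuse the 128-bit lowerings on each
+; half (immediate shifts plus blends for i64/i16, scalar extract/shift for
+; i32), while AVX2 folds the whole amount vector into a single memory
+; operand of vpsrlvq/vpsrlvd.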
+
+define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
+; AVX1-LABEL: constant_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
+; AVX1-LABEL: constant_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; AVX1-NEXT: shrl $9, %eax
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $2, %xmm1, %eax
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $3, %xmm1, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: shrl $5, %eax
+; AVX1-NEXT: vmovd %xmm0, %ecx
+; AVX1-NEXT: shrl $4, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: shrl $6, %eax
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: shrl $7, %eax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
+; AVX1-LABEL: constant_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
+; AVX1-LABEL: constant_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+ ret <32 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
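+; A uniform immediate shifts each AVX1 half separately, while AVX2 uses one
+; full-width instruction; v32i8 again pairs the word shift with a pand mask.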
+
+define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
+; AVX1-LABEL: splatconstant_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
+; AVX1-LABEL: splatconstant_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
+; AVX1-LABEL: splatconstant_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
+; AVX1-LABEL: splatconstant_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <32 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
new file mode 100644
index 0000000000000..3ac31ea636765
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -0,0 +1,639 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Variable Shifts
+;
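+; Variable left shifts can be turned into multiplies by 2^amount. For v4i32
+; the powers of two are built in parallel by shifting the amounts into the
+; float exponent field (pslld $23), adding the bias 1065353216 (1.0f) and
+; converting back with cvttps2dq, then multiplying with pmulld (or paired
+; pmuludq on SSE2); i16/i8 use the psllw/pblendvb ladders, and AVX2 lowers
+; i64/i32 directly to vpsllvq/vpsllvd.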
+
+define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: var_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllq %xmm3, %xmm2
+; SSE2-NEXT: psllq %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: psllq %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = shl <2 x i64> %a, %b
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: var_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = shl <4 x i32> %a, %b
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: var_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psllw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $2, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %shift = shl <8 x i16> %a, %b
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: var_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = shl <16 x i8> %a, %b
+ ret <16 x i8> %shift
+}
+
+;
+; Uniform Variable Shifts
+;
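+; A splatted amount is zero-extended into the low element (movss or pblendw
+; against zero) and fed to the shift-by-scalar forms psllq/pslld/psllw;
+; v16i8 broadcasts the byte with pshufb and runs the blend ladder.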
+
+define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: splatvar_shift_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psllq %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
+ %shift = shl <2 x i64> %a, %splat
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: splatvar_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: pslld %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; SSE41-NEXT: pslld %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+ %shift = shl <4 x i32> %a, %splat
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: splatvar_shift_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movzwl %ax, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: psllw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: splatvar_shift_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
+ %shift = shl <8 x i16> %a, %splat
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: splatvar_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: splatvar_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psllw $4, %xmm4
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psllw $2, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: splatvar_shift_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
+ %shift = shl <16 x i8> %a, %splat
+ ret <16 x i8> %shift
+}
+
+;
+; Constant Shifts
+;
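+; A constant left shift is a multiplication by a constant power of two, so
+; v8i16 becomes a single pmullw and v4i32 a pmulld (paired pmuludq on SSE2)
+; against the vector [16,32,64,128] = 2^<4,5,6,7>; v2i64 blends two
+; immediate psllq results, and only AVX2 keeps a true per-element shift via
+; vpsllvq/vpsllvd.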
+
+define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
+; SSE2-LABEL: constant_shift_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllq $7, %xmm1
+; SSE2-NEXT: psllq $1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq $7, %xmm1
+; SSE41-NEXT: psllq $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = shl <2 x i64> %a, <i64 1, i64 7>
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
+; SSE2-LABEL: constant_shift_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
+; SSE-LABEL: constant_shift_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: constant_shift_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
+; SSE2-LABEL: constant_shift_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: psllw $5, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllw $4, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: paddb %xmm2, %xmm2
+; SSE41-NEXT: paddb %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: constant_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+ ret <16 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
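+; Uniform constants use the immediate forms psllq/pslld/psllw directly; the
+; v16i8 case shifts words and masks with pand to zero the bits shifted
+; across byte boundaries.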
+
+define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
+; SSE-LABEL: splatconstant_shift_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psllq $7, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = shl <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %shift
+}
+
+define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) {
+; SSE-LABEL: splatconstant_shift_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $5, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
+ ret <4 x i32> %shift
+}
+
+define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
+; SSE-LABEL: splatconstant_shift_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %shift
+}
+
+define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
+; SSE-LABEL: splatconstant_shift_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: splatconstant_shift_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %shift
+}
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
new file mode 100644
index 0000000000000..7c13c0ae4716d
--- /dev/null
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -0,0 +1,459 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Variable Shifts
+;
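+; As with the right shifts, AVX1 splits each 256-bit shift into two xmm
+; halves: v8i32 repeats the float-exponent multiply trick per half, i16/i8
+; use the blend ladders (the final shift-by-one step of the byte ladder is a
+; vpaddb of the value to itself), and AVX2 emits vpsllvq/vpsllvd directly.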
+
+define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: var_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <4 x i64> %a, %b
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: var_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <8 x i32> %a, %b
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: var_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsllw $8, %xmm4, %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm2, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm2, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm4
+; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <16 x i16> %a, %b
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: var_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <32 x i8> %a, %b
+ ret <32 x i8> %shift
+}
+
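The blocks above exercise LLVM's generic blend-ladder lowering for vector element types with no native variable shift: vpsllw $5 (i8) or the vpsllw $12 / vpsllw $4 / vpor pair (i16) parks the significant shift-amount bits in each lane's sign-bit position, vpblendvb selects the shifted or unshifted value per lane from that sign bit, and vpaddb/vpaddw doubles the mask so the next amount bit is tested on the following round; the AVX2 i16 path instead widens to i32 lanes, shifts with vpsllvd, and repacks with vpackusdw. A minimal standalone reproduction of the i8 case, assuming the standard llc invocation these tests use:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2
define <32 x i8> @shl_v32i8_demo(<32 x i8> %a, <32 x i8> %b) {
  ; expected asm: the vpsllw $5 + vpblendvb ladder checked above
  %shift = shl <32 x i8> %a, %b
  ret <32 x i8> %shift
}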
+;
+; Uniform Variable Shifts
+;
+
+define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: splatvar_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
+ %shift = shl <4 x i64> %a, %splat
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: splatvar_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw $3, %xmm1, %xmm2, %xmm1 # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ %shift = shl <8 x i32> %a, %splat
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: splatvar_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
+ %shift = shl <16 x i16> %a, %splat
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: splatvar_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7
+; AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+ %shift = shl <32 x i8> %a, %splat
+ ret <32 x i8> %shift
+}
+
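The splat cases above reach the hardware count-in-xmm shift forms: vpsllw, vpslld and vpsllq take a single count from the low 64 bits of an XMM register, so only that count needs to be staged. That is why the v8i32 variant blends the count element with zero (clearing the rest of the 64-bit count field) and the v16i16 variant zero-extends it through movzwl/vmovd; v32i8 has no such form, so it broadcasts the count and falls back to the variable blend ladder shown earlier.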
+;
+; Constant Shifts
+;
+
+define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
+; AVX1-LABEL: constant_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsllq $62, %xmm1, %xmm2
+; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpsllq $7, %xmm0, %xmm2
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
+; AVX1-LABEL: constant_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
+; AVX1-LABEL: constant_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ ret <16 x i16> %shift
+}
+
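For compile-time-constant amounts a left shift is a multiply by a power of two, x << c == x * (1 << c) (e.g. 7 << 3 == 7 * 8 == 56), so the AVX1 checks for v8i32 and v16i16 above expect vpmulld/vpmullw against a constant-pool vector of powers of two, while AVX2 can use vpsllvd or a plain ymm vpmullw directly.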
+define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
+; AVX1-LABEL: constant_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*}}(%rip), %xmm4 # xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
+; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7
+; AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+ ret <32 x i8> %shift
+}
+
+;
+; Uniform Constant Shifts
+;
+
+define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
+; AVX1-LABEL: splatconstant_shift_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %shift
+}
+
+define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
+; AVX1-LABEL: splatconstant_shift_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpslld $5, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpslld $5, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <8 x i32> %shift
+}
+
+define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
+; AVX1-LABEL: splatconstant_shift_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <16 x i16> %shift
+}
+
+define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
+; AVX1-LABEL: splatconstant_shift_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_shift_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <32 x i8> %shift
+}
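x86 has no byte-granularity shift instruction, so the uniform v32i8 shift above is a word shift plus a repair mask: vpsllw $3 shifts each 16-bit lane, dragging the top bits of the even byte into the low bits of the odd byte, and the AND with a splat of 248 (0xf8) clears those three low bits in every byte, giving exactly (b << 3) & 0xff per byte (e.g. 0xff becomes 0xf8). The same trick at xmm width, as a minimal sketch assuming the standard llc invocation:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2
define <16 x i8> @shl_splatconst_v16i8_demo(<16 x i8> %a) {
  ; expected asm: vpsllw $3 on %xmm0, then vpand with a 16-byte splat of 248
  %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}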
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 53d13c86657b5..124d6e8c8ba2a 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -653,28 +653,28 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
-; SSE2-NEXT: shll $8, %edi
-; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: shll $8, %edi
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $2, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
-; SSSE3-NEXT: shll $8, %edi
-; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: shll $8, %edi
+; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
-; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $5, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $5, %edi, %xmm0
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -684,28 +684,28 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE2: # BB#0:
-; SSE2-NEXT: shll $8, %edi
-; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: shll $8, %edi
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $7, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSSE3: # BB#0:
-; SSSE3-NEXT: shll $8, %edi
-; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: shll $8, %edi
+; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE41: # BB#0:
-; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $15, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %edi, %xmm0
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
@@ -716,27 +716,27 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: movzbl %dil, %eax
-; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
-; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, %edi, %xmm0
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 3
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1341,12 +1341,12 @@ define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(
define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
; SSE: # BB#0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
; AVX: # BB#0:
-; AVX-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
%bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
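The vector-shuffle-128-v16.ll hunks above are mostly whitespace realignment, but they carry two real fixes: the AVX CHECK lines previously used the two-operand SSE spellings (vpxor %xmm0, %xmm0 and vpinsrb $5, %edi, %xmm0), which can never match the three-operand VEX forms AVX actually emits, and the unpack checks in shuffle_v16i8_bitcast_unpack now spell out all eight byte-interleave pairs and use the v-prefixed vpunpcklbw mnemonic on the AVX path.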
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 4007f0b2b13bf..6a29d33d6c5e7 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1384,14 +1384,14 @@ define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $1, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $1, %edi, %xmm0
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
@@ -1401,14 +1401,14 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $5, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, %edi, %xmm0
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
@@ -1418,14 +1418,14 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $7, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, %edi, %xmm0
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
@@ -1435,14 +1435,14 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $2, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, %edi, %xmm0
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 3
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 944ec4b8d3ac7..62bf288a870d1 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -810,30 +810,20 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
}
define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
-; AVX1-LABEL: insert_reg_and_zero_v4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_reg_and_zero_v4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: retq
+; ALL-LABEL: insert_reg_and_zero_v4i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq %rdi, %xmm0
+; ALL-NEXT: retq
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x i64> %shuffle
}
define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
-; AVX1-LABEL: insert_mem_and_zero_v4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
+; ALL-LABEL: insert_mem_and_zero_v4i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -874,15 +864,10 @@ define <4 x double> @splat_mem_v4f64(double* %ptr) {
}
define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
-; AVX1-LABEL: splat_mem_v4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splat_mem_v4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: splat_mem_v4i64:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -915,6 +900,60 @@ define <4 x double> @splat_v4f64(<2 x double> %r) {
ret <4 x double> %1
}
+define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
+; AVX1-LABEL: splat_mem_v4i64_from_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_mem_v4i64_from_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+ %v = load <2 x i64>, <2 x i64>* %ptr
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
+; AVX1-LABEL: splat_mem_v4f64_from_v2f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_mem_v4f64_from_v2f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+ %v = load <2 x double>, <2 x double>* %ptr
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x double> %shuffle
+}
+
+define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
+; ALL-LABEL: splat128_mem_v4i64_from_v2i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovaps (%rdi), %xmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %v = load <2 x i64>, <2 x i64>* %ptr
+ %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
+; ALL-LABEL: splat128_mem_v4f64_from_v2f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovaps (%rdi), %xmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %v = load <2 x double>, <2 x double>* %ptr
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %shuffle
+}
+
define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: bitcast_v4f64_0426:
; AVX1: # BB#0:
@@ -923,7 +962,7 @@ define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
;
; AVX2-LABEL: bitcast_v4f64_0426:
; AVX2: # BB#0:
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: retq
%shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
%bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float>
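In vector-shuffle-256-v4.ll, test cases where AVX1 and AVX2 produce identical code collapse their duplicated check blocks into the shared ALL prefix, while the new splat_mem_*_from_v2* tests keep separate prefixes because AVX2 can broadcast straight from memory (vbroadcastsd) where AVX1 must vmovddup and vinsertf128.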
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index bb07077b5559c..bc72e0a661777 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2088,15 +2088,10 @@ entry:
}
define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) {
-; AVX1-LABEL: insert_mem_and_zero_v8i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v8i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: retq
+; ALL-LABEL: insert_mem_and_zero_v8i32:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: retq
%a = load i32, i32* %ptr
%v = insertelement <8 x i32> undef, i32 %a, i32 0
%shuffle = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 2c6c8a3e7ade3..62d4af7809b6b 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -15,8 +15,9 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00000010:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x double> %shuffle
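The remaining vector-shuffle-512-v8.ll hunks follow the same pattern as the one above: instead of one zmm-wide permute driven by an index vector loaded from the constant pool (vmovdqa64 + vpermpd/vpermt2pd) or an immediate-form vpermilpd/vshufpd/vshuff64x2 on the full zmm, the expected lowering decomposes each v8f64 shuffle into 256-bit halves with vextractf64x4, per-half vpermpd/vpermilpd/vblendpd, and a final vinsertf64x4.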
@@ -25,8 +26,9 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00000200:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -35,8 +37,9 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00003000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -45,8 +48,11 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00040000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -55,8 +61,11 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00500000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -65,8 +74,11 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_06000000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -75,11 +87,11 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_70000000:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: movl $7, %eax
-; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -88,7 +100,10 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01014545:
; ALL: # BB#0:
-; ALL-NEXT: vshuff64x2 $160, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -97,8 +112,9 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00112233:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x double> %shuffle
@@ -107,8 +123,9 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00001111:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x double> %shuffle
@@ -117,7 +134,11 @@ define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_81a3c5e7:
; ALL: # BB#0:
-; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x double> %shuffle
@@ -126,9 +147,10 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08080808:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x double> %shuffle
@@ -137,9 +159,15 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08084c4c:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x double> %shuffle
@@ -148,9 +176,13 @@ define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_8823cc67:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -159,9 +191,13 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_9832dc76:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -170,9 +206,13 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_9810dc54:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -181,9 +221,15 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08194c5d:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x double> %shuffle
@@ -192,9 +238,15 @@ define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_2a3b6e7f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x double> %shuffle
@@ -203,9 +255,13 @@ define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08192a3b:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x double> %shuffle
@@ -214,9 +270,11 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08991abb:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x double> %shuffle
@@ -225,9 +283,12 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_091b2d3f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x double> %shuffle
@@ -236,9 +297,11 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_09ab1def:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x double> %shuffle
@@ -247,7 +310,10 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00014445:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $64, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -256,7 +322,10 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00204464:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $32, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x double> %shuffle
@@ -265,7 +334,10 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_03004744:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $12, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -274,7 +346,10 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10005444:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $1, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -283,7 +358,10 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_22006644:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $10, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -292,7 +370,10 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_33307774:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $63, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -301,7 +382,10 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_32107654:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $27, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -310,7 +394,10 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00234467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -319,7 +406,10 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00224466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -328,7 +418,10 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10325476:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -337,7 +430,10 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_11335577:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x double> %shuffle
@@ -346,7 +442,10 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -355,7 +454,10 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10225466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -364,8 +466,10 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00015444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -374,8 +478,10 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00204644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -384,8 +490,10 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_03004474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -394,8 +502,10 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10004444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -404,8 +514,10 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_22006446:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x double> %shuffle
@@ -414,8 +526,10 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_33307474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -424,8 +538,9 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_32104567:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -434,8 +549,10 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00236744:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -444,8 +561,10 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00226644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -454,7 +573,9 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10324567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -463,7 +584,9 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_11334567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -472,7 +595,9 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -481,7 +606,9 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01235466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -490,8 +617,10 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_002u6u44:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -500,8 +629,10 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00uu66uu:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -510,7 +641,9 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_103245uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -519,7 +652,9 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_1133uu67:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -528,7 +663,9 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_0uu354uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -537,7 +674,9 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_uuu3uu66:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -546,9 +685,16 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_c348cda0:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm4
+; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
ret <8 x double> %shuffle
@@ -557,9 +703,17 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_f511235a:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,3,1,3]
+; ALL-NEXT: vmovddup {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
ret <8 x double> %shuffle
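Every v8f64 hunk above makes the same substitution: the old lowering either loaded a permutation-index vector from the constant pool (vmovdqa64 + vpermpd/vpermt2pd) or encoded the whole 512-bit shuffle as one vpermilpd immediate; the new lowering splits the vector into its 256-bit halves, shuffles each half with an immediate-encoded AVX permute, and reassembles with vinsertf64x4. A representative before/after pair, copied from the shuffle_v8f64_32104567 checks (the {{...}} fragments are FileCheck patterns, not operands):

    # Before: index vector loaded from memory, one full-width permute.
    vmovdqa64 {{.*}}(%rip), %zmm1
    vpermpd %zmm0, %zmm1, %zmm0
    # After: immediate-encoded per-half shuffle, no constant-pool load.
    vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
    vextractf64x4 $1, %zmm0, %ymm0
    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0

The new sequences are often an instruction or two longer, but they drop the memory access and the constant-pool entry, and every remaining step is an in-register AVX/AVX2 operation.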
@@ -577,8 +731,9 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000010:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x i64> %shuffle
@@ -587,8 +742,9 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000200:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -597,8 +753,9 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00003000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -607,8 +764,11 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00040000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -617,8 +777,11 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00500000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -627,8 +790,11 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_06000000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -637,11 +803,11 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_70000000:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: movl $7, %eax
-; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -650,7 +816,10 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01014545:
; ALL: # BB#0:
-; ALL-NEXT: vshufi64x2 $160, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -659,8 +828,9 @@ define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00112233:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i64> %shuffle
@@ -669,8 +839,9 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00001111:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i64> %shuffle
@@ -679,7 +850,11 @@ define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_81a3c5e7:
; ALL: # BB#0:
-; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x i64> %shuffle
@@ -688,9 +863,10 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08080808:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x i64> %shuffle
@@ -699,9 +875,15 @@ define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08084c4c:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x i64> %shuffle
@@ -710,9 +892,13 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_8823cc67:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -721,9 +907,13 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_9832dc76:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -732,9 +922,13 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_9810dc54:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -743,9 +937,15 @@ define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08194c5d:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i64> %shuffle
@@ -754,9 +954,15 @@ define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_2a3b6e7f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i64> %shuffle
@@ -765,9 +971,13 @@ define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08192a3b:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i64> %shuffle
@@ -776,9 +986,11 @@ define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08991abb:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x i64> %shuffle
@@ -787,9 +999,12 @@ define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_091b2d3f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x i64> %shuffle
@@ -798,9 +1013,11 @@ define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_09ab1def:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x i64> %shuffle
@@ -809,7 +1026,10 @@ define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00014445:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $64, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -818,7 +1038,10 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00204464:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $32, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x i64> %shuffle
@@ -827,7 +1050,10 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_03004744:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $12, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -836,7 +1062,10 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10005444:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $1, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -845,7 +1074,10 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_22006644:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $10, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -854,7 +1086,10 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_33307774:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $63, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -863,7 +1098,10 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_32107654:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $27, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -872,7 +1110,10 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00234467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -881,7 +1122,10 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00224466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -890,7 +1134,10 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10325476:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -899,7 +1146,10 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_11335577:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i64> %shuffle
@@ -908,7 +1158,10 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -917,7 +1170,10 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10225466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -926,8 +1182,10 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00015444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -936,8 +1194,10 @@ define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00204644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -946,8 +1206,10 @@ define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_03004474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -956,8 +1218,10 @@ define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10004444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -966,8 +1230,10 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_22006446:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x i64> %shuffle
@@ -976,8 +1242,10 @@ define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_33307474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -986,8 +1254,9 @@ define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_32104567:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -996,8 +1265,10 @@ define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00236744:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1006,8 +1277,10 @@ define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00226644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1016,7 +1289,9 @@ define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10324567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1025,7 +1300,9 @@ define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_11334567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1034,7 +1311,9 @@ define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1043,7 +1322,9 @@ define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01235466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1052,8 +1333,10 @@ define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_002u6u44:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1062,8 +1345,10 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00uu66uu:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1072,7 +1357,9 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_103245uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1081,7 +1368,9 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_1133uu67:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1090,7 +1379,9 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_0uu354uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1099,7 +1390,9 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_uuu3uu66:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1108,9 +1401,15 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_6caa87e5:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
ret <8 x i64> %shuffle
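The v8i64 hunks follow the same shape as the v8f64 ones, using the integer-domain forms (vpermq, vpshufd, vpblendd, vextracti64x4/vinserti64x4). A representative single-source pair, copied from the shuffle_v8i64_00112233 checks:

    # Before: index vector from memory, full-width vpermq.
    vmovdqa64 {{.*}}(%rip), %zmm1
    vpermq %zmm0, %zmm1, %zmm0
    # After: each 256-bit half permuted by immediate, then reassembled.
    vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
    vinserti64x4 $1, %ymm0, %zmm1, %zmm0

The two-source cases gain a little more: the old vpermt2q sequences produced the result in %zmm2 (the destination is tied to a source operand) and needed a trailing vmovaps %zmm2, %zmm0 to move it into the return register, while the blend-based replacements compute it in place.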
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index a2f3d7b82b369..0a6eea049d372 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: cvtsi2ss
+; CHECK: cvtdq2ps
; signed-integer to float conversion, v2i16 to v2f32
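The only change in this file is the expected opcode: the conversion should now select the packed cvtdq2ps instead of the scalar cvtsi2ss. The function body lies outside this hunk; a minimal sketch of IR matching the comment above (function and value names are assumptions, not taken from the file):

    ; Load a <2 x i16>, sign-convert to <2 x float>; after type widening
    ; this should come out as one packed cvtdq2ps rather than scalarized
    ; cvtsi2ss conversions.
    define void @convert(<2 x float>* %dst, <2 x i16>* %src) {
    entry:
      %val = load <2 x i16>, <2 x i16>* %src
      %cvt = sitofp <2 x i16> %val to <2 x float>
      store <2 x float> %cvt, <2 x float>* %dst
      ret void
    }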
diff --git a/test/CodeGen/X86/win64_params.ll b/test/CodeGen/X86/win64_params.ll
index 9718c86300c25..a0b552d4d5847 100644
--- a/test/CodeGen/X86/win64_params.ll
+++ b/test/CodeGen/X86/win64_params.ll
@@ -7,8 +7,7 @@ define i32 @f6(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind re
entry:
; CHECK: movl 48(%rsp), %eax
; CHECK: addl 40(%rsp), %eax
-; LINUX: addl %r9d, %r8d
-; LINUX: movl %r8d, %eax
+; LINUX: leal (%r8,%r9), %eax
%add = add nsw i32 %p6, %p5
ret i32 %add
}
@@ -27,10 +26,8 @@ entry:
; on other platforms here (note the x86_64_sysvcc calling convention).
define x86_64_sysvcc i32 @f8(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize {
entry:
-; CHECK: addl %r9d, %r8d
-; CHECK: movl %r8d, %eax
-; LINUX: addl %r9d, %r8d
-; LINUX: movl %r8d, %eax
+; CHECK: leal (%r8,%r9), %eax
+; LINUX: leal (%r8,%r9), %eax
%add = add nsw i32 %p6, %p5
ret i32 %add
}
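Both hunks in this file assert the same peephole: when the sum of two register parameters is also the return value, a three-operand LEA writes the result directly into %eax, where the old code clobbered %r8d with an addl and then copied it out:

    # Before: two instructions, and %r8d is clobbered.
    addl %r9d, %r8d
    movl %r8d, %eax
    # After: one LEA computes p5+p6 straight into the return register.
    leal (%r8,%r9), %eax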
diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll
index 199557dac2061..77c37b4d348e2 100644
--- a/test/CodeGen/X86/win_cst_pool.ll
+++ b/test/CodeGen/X86/win_cst_pool.ll
@@ -64,3 +64,16 @@ define <4 x float> @undef1() {
; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0
; CHECK-NEXT: ret
}
+
+define float @pr23966(i32 %a) {
+ %tobool = icmp ne i32 %a, 0
+ %sel = select i1 %tobool, float -1.000000e+00, float 1.000000e+00
+ ret float %sel
+}
+
+; CHECK: .globl __real@bf8000003f800000
+; CHECK-NEXT: .section .rdata,"dr",discard,__real@bf8000003f800000
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: __real@bf8000003f800000:
+; CHECK-NEXT: .long 1065353216
+; CHECK-NEXT: .long 3212836864
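Decoding the new pool entry (standard IEEE-754 single-precision bit patterns): the select's two float constants are merged into a single 8-byte, COMDAT-deduplicated __real@ symbol whose name concatenates the two encodings, emitted low dword first:

    .long 1065353216    # 0x3f800000 = +1.0f
    .long 3212836864    # 0xbf800000 = -1.0f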
diff --git a/test/CodeGen/X86/win_ftol2.ll b/test/CodeGen/X86/win_ftol2.ll
index 14591248f354e..dfa6e3aa76bdd 100644
--- a/test/CodeGen/X86/win_ftol2.ll
+++ b/test/CodeGen/X86/win_ftol2.ll
@@ -142,3 +142,25 @@ define i64 @double_ui64_5(double %X) {
%tmp.1 = fptoui double %X to i64
ret i64 %tmp.1
}
+
+define double @pr23957_32(double %A) {
+; FTOL-LABEL: @pr23957_32
+; FTOL: fldl
+; FTOL-NEXT: fld %st(0)
+; FTOL-NEXT: calll __ftol2
+ %B = fptoui double %A to i32
+ %C = uitofp i32 %B to double
+ %D = fsub double %C, %A
+ ret double %D
+}
+
+define double @pr23957_64(double %A) {
+; FTOL-LABEL: @pr23957_64
+; FTOL: fldl
+; FTOL-NEXT: fld %st(0)
+; FTOL-NEXT: calll __ftol2
+ %B = fptoui double %A to i64
+ %C = uitofp i64 %B to double
+ %D = fsub double %C, %A
+ ret double %D
+}
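Both new tests pin down the same x87 detail: __ftol2 consumes %st(0), so when the original double is still needed for the fsub after the conversion, it must be duplicated on the FP stack with fld %st(0) before the call. In sequence (the fldl operand is an assumption; the checks match only the mnemonics):

    fldl 4(%esp)    # push the incoming double onto the x87 stack
    fld %st(0)      # duplicate it; __ftol2 pops one copy
    calll __ftol2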
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index 829be41e51279..f78fe27578651 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s -check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 | FileCheck %s -check-prefix=X64
@@ -193,3 +193,22 @@ define i32 @test11(i32 %b) {
; X32: movl $-2, %[[REG:.*]]
; X32: roll %{{.*}}, %[[REG]]
}
+
+%struct.ref_s = type { %union.v, i16, i16 }
+%union.v = type { i64 }
+
+define %struct.ref_s* @test12(%struct.ref_s* %op, i64 %osbot, i64 %intval) {
+ %neg = shl i64 %intval, 32
+ %sext = xor i64 %neg, -4294967296
+ %idx.ext = ashr exact i64 %sext, 32
+ %add.ptr = getelementptr inbounds %struct.ref_s, %struct.ref_s* %op, i64 %idx.ext
+ ret %struct.ref_s* %add.ptr
+; X64-LABEL: test12:
+; X64: shlq $32, %[[REG:.*]]
+; X64-NOT: not
+; X64: sarq $28, %[[REG]]
+; X32-LABEL: test12:
+; X32: leal
+; X32-NOT: not
+; X32: shll $2, %eax
+}
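The X64 checks encode two facts. The xor with -4294967296 (0xffffffff00000000) must not surface as a separate not instruction, and since %struct.ref_s is 16 bytes (i64 plus two i16, padded to 8-byte alignment) the getelementptr's scale folds into the arithmetic shift: after shlq $32 the low 32 bits are known zero, so (v >>s 32) * 16 == v >>s 28, which is why the checks expect sarq $28 rather than a 32-bit shift plus a separate multiply. Sketched with a concrete register (an assumption; the checks bind it via %[[REG]]):

    shlq $32, %rcx    # low 32 bits of %rcx now known zero
    sarq $28, %rcx    # equals (%rcx >>s 32) * 16; the GEP scale is folded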