Diffstat (limited to 'test')
-rw-r--r--  test/Analysis/CostModel/AArch64/store.ll | 58
-rw-r--r--  test/Analysis/CostModel/X86/slm-arith-costs.ll | 317
-rwxr-xr-x  test/Analysis/CostModel/X86/strided-load-i16.ll | 226
-rwxr-xr-x  test/Analysis/CostModel/X86/strided-load-i32.ll | 220
-rwxr-xr-x  test/Analysis/CostModel/X86/strided-load-i64.ll | 162
-rwxr-xr-x  test/Analysis/CostModel/X86/strided-load-i8.ll | 234
-rw-r--r--  test/Analysis/CostModel/X86/vshift-ashr-cost.ll | 27
-rw-r--r--  test/Analysis/CostModel/X86/vshift-lshr-cost.ll | 25
-rw-r--r--  test/Analysis/CostModel/X86/vshift-shl-cost.ll | 23
-rw-r--r--  test/Analysis/ScalarEvolution/max-trip-count.ll | 81
-rw-r--r--  test/CodeGen/AArch64/arm64-neon-copy.ll | 9
-rw-r--r--  test/CodeGen/AArch64/arm64-nvcast.ll | 18
-rw-r--r--  test/CodeGen/AArch64/bitreverse.ll | 35
-rw-r--r--  test/CodeGen/AArch64/rbit.ll | 22
-rw-r--r--  test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir | 858
-rw-r--r--  test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll | 262
-rw-r--r--  test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll | 12
-rw-r--r--  test/CodeGen/AMDGPU/fneg-combines.ll | 1282
-rw-r--r--  test/CodeGen/AMDGPU/fp16_to_fp.ll | 29
-rw-r--r--  test/CodeGen/AMDGPU/fp16_to_fp32.ll | 22
-rw-r--r--  test/CodeGen/AMDGPU/fp16_to_fp64.ll | 16
-rw-r--r--  test/CodeGen/AMDGPU/fp32_to_fp16.ll | 17
-rw-r--r--  test/CodeGen/AMDGPU/insert_vector_elt.ll | 8
-rw-r--r--  test/CodeGen/AMDGPU/local-stack-slot-bug.ll | 7
-rw-r--r--  test/CodeGen/AMDGPU/mad-combine.ll | 16
-rw-r--r--  test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll | 46
-rw-r--r--  test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll | 840
-rw-r--r--  test/CodeGen/AMDGPU/select-opt.ll | 161
-rw-r--r--  test/CodeGen/AMDGPU/sext-in-reg.ll | 133
-rw-r--r--  test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir | 597
-rw-r--r--  test/CodeGen/AMDGPU/v_mac.ll | 66
-rw-r--r--  test/CodeGen/ARM/fp16-promote.ll | 4
-rw-r--r--  test/CodeGen/ARM/fpcmp_ueq.ll | 6
-rw-r--r--  test/CodeGen/ARM/vdup.ll | 3
-rw-r--r--  test/CodeGen/ARM/vpadd.ll | 248
-rw-r--r--  test/CodeGen/ARM/vtrn.ll | 4
-rw-r--r--  test/CodeGen/Mips/llvm-ir/extractelement.ll | 3
-rw-r--r--  test/CodeGen/Mips/msa/immediates-bad.ll | 1681
-rw-r--r--  test/CodeGen/Mips/msa/immediates.ll | 1276
-rw-r--r--  test/CodeGen/Mips/msa/msa-nooddspreg.ll | 55
-rw-r--r--  test/CodeGen/NVPTX/fast-math.ll | 17
-rw-r--r--  test/CodeGen/PowerPC/change-no-infs.ll | 67
-rw-r--r--  test/CodeGen/PowerPC/variable_elem_vec_extracts.ll | 6
-rw-r--r--  test/CodeGen/WebAssembly/function-bitcasts.ll | 16
-rw-r--r--  test/CodeGen/X86/atom-bypass-slow-division-64.ll | 51
-rw-r--r--  test/CodeGen/X86/atom-bypass-slow-division.ll | 112
-rw-r--r--  test/CodeGen/X86/atomic-eflags-reuse.ll | 80
-rw-r--r--  test/CodeGen/X86/avx-cvt.ll | 22
-rwxr-xr-x  test/CodeGen/X86/avx-trunc.ll | 26
-rw-r--r--  test/CodeGen/X86/avx512-cvt.ll | 1388
-rw-r--r--  test/CodeGen/X86/avx512-select.ll | 19
-rw-r--r--  test/CodeGen/X86/avx512-trunc.ll | 110
-rw-r--r--  test/CodeGen/X86/bypass-slow-division-32.ll | 240
-rw-r--r--  test/CodeGen/X86/bypass-slow-division-64.ll | 78
-rw-r--r--  test/CodeGen/X86/bypass-slow-division-tune.ll | 55
-rw-r--r--  test/CodeGen/X86/change-unsafe-fp-math.ll | 56
-rw-r--r--  test/CodeGen/X86/cmp.ll | 52
-rw-r--r--  test/CodeGen/X86/cpus.ll | 1
-rw-r--r--  test/CodeGen/X86/extractelement-index.ll | 16
-rw-r--r--  test/CodeGen/X86/extractelement-legalization-store-ordering.ll | 10
-rw-r--r--  test/CodeGen/X86/i64-mem-copy.ll | 3
-rw-r--r--  test/CodeGen/X86/implicit-null-checks.mir | 2
-rw-r--r--  test/CodeGen/X86/lzcnt-zext-cmp.ll | 2
-rw-r--r--  test/CodeGen/X86/peephole.mir | 40
-rw-r--r--  test/CodeGen/X86/slow-div.ll | 43
-rw-r--r--  test/CodeGen/X86/slow-unaligned-mem.ll | 1
-rw-r--r--  test/CodeGen/X86/sse2-intrinsics-fast-isel.ll | 13
-rw-r--r--  test/CodeGen/X86/vec_ins_extract-1.ll | 24
-rw-r--r--  test/CodeGen/X86/vec_insert-4.ll | 6
-rw-r--r--  test/CodeGen/X86/vec_insert-8.ll | 18
-rw-r--r--  test/CodeGen/X86/vec_int_to_fp.ll | 60
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-128.ll | 113
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-256.ll | 199
-rw-r--r--  test/CodeGen/X86/vector-shift-ashr-512.ll | 1123
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-128.ll | 109
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-256.ll | 160
-rw-r--r--  test/CodeGen/X86/vector-shift-lshr-512.ll | 1077
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-128.ll | 107
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-256.ll | 154
-rw-r--r--  test/CodeGen/X86/vector-shift-shl-512.ll | 1071
-rw-r--r--  test/CodeGen/X86/vector-shuffle-avx512.ll | 333
-rw-r--r--  test/CodeGen/X86/vector-shuffle-combining-xop.ll | 14
-rw-r--r--  test/CodeGen/X86/vector-shuffle-variable-128.ll | 1184
-rw-r--r--  test/CodeGen/X86/vector-shuffle-variable-256.ll | 334
-rw-r--r--  test/CodeGen/X86/x86-64-double-shifts-var.ll | 1
-rw-r--r--  test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll | 40
-rw-r--r--  test/DebugInfo/Inputs/implicit-const-test.o | bin 0 -> 488 bytes
-rw-r--r--  test/DebugInfo/dwarfdump-implicit-const.test | 2
-rw-r--r--  test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s | 4
-rw-r--r--  test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s | 6
-rw-r--r--  test/FileCheck/match-full-lines.txt | 53
-rw-r--r--  test/FileCheck/strict-whitespace-match-full-lines.txt | 14
-rw-r--r--  test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll | 2
-rw-r--r--  test/MC/AMDGPU/vop_dpp.s | 32
-rw-r--r--  test/MC/AMDGPU/vop_sdwa.s | 36
-rw-r--r--  test/MC/ARM/directive-object_arch-2.s | 2
-rw-r--r--  test/MC/ARM/directive-object_arch.s | 2
-rw-r--r--  test/ObjectYAML/MachO/DWARF-debug_info.yaml | 52
-rw-r--r--  test/ObjectYAML/MachO/DWARF-debug_line.yaml | 595
-rw-r--r--  test/Other/loop-pass-ordering.ll | 11
-rw-r--r--  test/Other/new-pass-manager.ll | 27
-rw-r--r--  test/Other/pass-pipeline-parsing.ll | 8
-rw-r--r--  test/Transforms/GVN/assume-equal.ll | 8
-rw-r--r--  test/Transforms/GVN/invariant.group.ll | 38
-rw-r--r--  test/Transforms/InstCombine/fabs.ll | 50
-rw-r--r--  test/Transforms/InstCombine/fast-math.ll | 8
-rw-r--r--  test/Transforms/InstCombine/fdiv.ll | 18
-rw-r--r--  test/Transforms/InstCombine/pow-4.ll | 56
-rw-r--r--  test/Transforms/InstCombine/pow-sqrt.ll | 11
-rw-r--r--  test/Transforms/InstSimplify/floating-point-arithmetic.ll | 92
-rw-r--r--  test/Transforms/LICM/argmemonly-call.ll | 2
-rw-r--r--  test/Transforms/LICM/assume.ll | 2
-rw-r--r--  test/Transforms/LICM/atomics.ll | 2
-rw-r--r--  test/Transforms/LICM/basictest.ll | 2
-rw-r--r--  test/Transforms/LICM/constexpr.ll | 2
-rw-r--r--  test/Transforms/LICM/crash.ll | 2
-rw-r--r--  test/Transforms/LICM/debug-value.ll | 2
-rw-r--r--  test/Transforms/LICM/extra-copies.ll | 2
-rw-r--r--  test/Transforms/LICM/funclet.ll | 2
-rw-r--r--  test/Transforms/LICM/hoist-bitcast-load.ll | 2
-rw-r--r--  test/Transforms/LICM/hoist-deref-load.ll | 2
-rw-r--r--  test/Transforms/LICM/hoist-nounwind.ll | 2
-rw-r--r--  test/Transforms/LICM/hoist-round.ll | 2
-rw-r--r--  test/Transforms/LICM/hoisting.ll | 2
-rw-r--r--  test/Transforms/LICM/lcssa-ssa-promoter.ll | 2
-rw-r--r--  test/Transforms/LICM/no-preheader-test.ll | 2
-rw-r--r--  test/Transforms/LICM/opt-remarks-conditional-load.ll | 47
-rw-r--r--  test/Transforms/LICM/opt-remarks-intervening-store.ll | 67
-rw-r--r--  test/Transforms/LICM/opt-remarks.ll | 81
-rw-r--r--  test/Transforms/LICM/preheader-safe.ll | 2
-rw-r--r--  test/Transforms/LICM/promote-order.ll | 2
-rw-r--r--  test/Transforms/LICM/promote-tls.ll | 2
-rw-r--r--  test/Transforms/LICM/scalar-promote-memmodel.ll | 2
-rw-r--r--  test/Transforms/LICM/scalar_promote-unwind.ll | 2
-rw-r--r--  test/Transforms/LICM/scalar_promote.ll | 2
-rw-r--r--  test/Transforms/LICM/speculate.ll | 2
-rw-r--r--  test/Transforms/LICM/volatile-alias.ll | 2
-rw-r--r--  test/Transforms/LoopSimplify/preserve-scev.ll | 61
-rw-r--r--  test/Transforms/LoopUnroll/peel-loop-pgo.ll | 39
-rw-r--r--  test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll | 145
-rw-r--r--  test/Transforms/LoopVectorize/pr31190.ll | 64
-rw-r--r--  test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml | 2
-rw-r--r--  test/Transforms/LowerTypeTests/function.ll | 2
-rw-r--r--  test/Transforms/LowerTypeTests/import-unsat.ll | 2
-rw-r--r--  test/Transforms/LowerTypeTests/simple.ll | 6
-rw-r--r--  test/Transforms/NewGVN/assume-equal.ll | 8
-rw-r--r--  test/Transforms/NewGVN/invariant.group.ll | 39
-rw-r--r--  test/Transforms/NewGVN/pr31594.ll | 119
-rw-r--r--  test/Transforms/PGOProfile/Inputs/multiple_hash_profile.proftext | 36
-rw-r--r--  test/Transforms/PGOProfile/comdat_internal.ll | 8
-rw-r--r--  test/Transforms/PGOProfile/comdat_rename.ll | 20
-rw-r--r--  test/Transforms/PGOProfile/indirect_call_profile.ll | 2
-rw-r--r--  test/Transforms/PGOProfile/multiple_hash_profile.ll | 36
-rw-r--r--  test/Transforms/SLPVectorizer/X86/pr31599.ll | 30
-rw-r--r--  test/Transforms/StructurizeCFG/no-branch-to-entry.ll | 2
-rw-r--r--  test/tools/llvm-config/booleans.test | 28
-rwxr-xr-x  test/tools/llvm-xray/X86/Inputs/elf64-objcopied-instrmap.bin | bin 0 -> 144070 bytes
-rwxr-xr-x  test/tools/llvm-xray/X86/Inputs/elf64-sample-o2.bin | bin 0 -> 478375 bytes
-rw-r--r--  test/tools/llvm-xray/X86/Inputs/naive-log-simple.xray | bin 0 -> 224 bytes
-rw-r--r--  test/tools/llvm-xray/X86/Inputs/simple-instrmap.yaml | 10
-rw-r--r--  test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml | 14
-rw-r--r--  test/tools/llvm-xray/X86/account-deduce-tail-call.yaml | 36
-rw-r--r--  test/tools/llvm-xray/X86/account-keep-going.yaml | 20
-rw-r--r--  test/tools/llvm-xray/X86/account-simple-case.yaml | 18
-rw-r--r--  test/tools/llvm-xray/X86/account-simple-sorting.yaml | 85
-rw-r--r--  test/tools/llvm-xray/X86/bad-instrmap-sizes.txt (renamed from test/tools/llvm-xray/X86/bad-instrmap-sizes.bin) | 0
-rw-r--r--  test/tools/llvm-xray/X86/convert-roundtrip.yaml | 28
-rw-r--r--  test/tools/llvm-xray/X86/convert-to-yaml.txt | 23
-rw-r--r--  test/tools/llvm-xray/X86/convert-with-debug-syms.txt | 23
-rw-r--r--  test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt | 23
-rw-r--r--  test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt | 23
171 files changed, 14820 insertions, 5264 deletions
diff --git a/test/Analysis/CostModel/AArch64/store.ll b/test/Analysis/CostModel/AArch64/store.ll
index 58750721cb977..085863554f007 100644
--- a/test/Analysis/CostModel/AArch64/store.ll
+++ b/test/Analysis/CostModel/AArch64/store.ll
@@ -1,17 +1,59 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-apple-ios | FileCheck %s
-; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-apple-ios -mattr=slow-misaligned-128store | FileCheck %s --check-prefix=SLOW_MISALIGNED_128_STORE
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-unknown | FileCheck %s
+; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-unknown -mattr=slow-misaligned-128store | FileCheck %s --check-prefix=SLOW_MISALIGNED_128_STORE
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
; CHECK-LABEL: getMemoryOpCost
; SLOW_MISALIGNED_128_STORE-LABEL: getMemoryOpCost
define void @getMemoryOpCost() {
- ; If FeatureSlowMisaligned128Store is set, we penalize <2 x i64> stores. On
- ; Cyclone, for example, such stores should be expensive because we don't
- ; split them and misaligned 16b stores have bad performance.
- ;
- ; CHECK: cost of 1 {{.*}} store
- ; SLOW_MISALIGNED_128_STORE: cost of 12 {{.*}} store
+ ; If FeatureSlowMisaligned128Store is set, we penalize 128-bit stores.
+ ; The unlegalized 256-bit stores are further penalized when legalized down
+ ; to 128-bit stores.
+
+ ; CHECK: cost of 2 for {{.*}} store <4 x i64>
+ ; SLOW_MISALIGNED_128_STORE: cost of 24 for {{.*}} store <4 x i64>
+ store <4 x i64> undef, <4 x i64> * undef
+ ; CHECK-NEXT: cost of 2 for {{.*}} store <8 x i32>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <8 x i32>
+ store <8 x i32> undef, <8 x i32> * undef
+ ; CHECK-NEXT: cost of 2 for {{.*}} store <16 x i16>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <16 x i16>
+ store <16 x i16> undef, <16 x i16> * undef
+ ; CHECK-NEXT: cost of 2 for {{.*}} store <32 x i8>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <32 x i8>
+ store <32 x i8> undef, <32 x i8> * undef
+
+ ; CHECK-NEXT: cost of 2 for {{.*}} store <4 x double>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <4 x double>
+ store <4 x double> undef, <4 x double> * undef
+ ; CHECK-NEXT: cost of 2 for {{.*}} store <8 x float>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <8 x float>
+ store <8 x float> undef, <8 x float> * undef
+ ; CHECK-NEXT: cost of 2 for {{.*}} store <16 x half>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <16 x half>
+ store <16 x half> undef, <16 x half> * undef
+
+ ; CHECK-NEXT: cost of 1 for {{.*}} store <2 x i64>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <2 x i64>
store <2 x i64> undef, <2 x i64> * undef
+ ; CHECK-NEXT: cost of 1 for {{.*}} store <4 x i32>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <4 x i32>
+ store <4 x i32> undef, <4 x i32> * undef
+ ; CHECK-NEXT: cost of 1 for {{.*}} store <8 x i16>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <8 x i16>
+ store <8 x i16> undef, <8 x i16> * undef
+ ; CHECK-NEXT: cost of 1 for {{.*}} store <16 x i8>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <16 x i8>
+ store <16 x i8> undef, <16 x i8> * undef
+
+ ; CHECK-NEXT: cost of 1 for {{.*}} store <2 x double>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <2 x double>
+ store <2 x double> undef, <2 x double> * undef
+ ; CHECK-NEXT: cost of 1 for {{.*}} store <4 x float>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <4 x float>
+ store <4 x float> undef, <4 x float> * undef
+ ; CHECK-NEXT: cost of 1 for {{.*}} store <8 x half>
+ ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <8 x half>
+ store <8 x half> undef, <8 x half> * undef
; We scalarize the loads/stores because there is no vector register name for
; these types (they get extended to v.4h/v.2s).
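A note on reproducing the two cost figures above (a sketch outside the patch, not part of the test suite): per the CHECK lines, a plain <2 x i64> store costs 1, rising to 12 once FeatureSlowMisaligned128Store is set. Assuming a file named store128.ll (a hypothetical name) containing only the function below, the same opt invocations as the RUN lines print those costs:

; store128.ll -- illustrative only
; opt < store128.ll -cost-model -analyze -mtriple=aarch64-unknown
;   => cost of 1 for the store
; opt < store128.ll -cost-model -analyze -mtriple=aarch64-unknown \
;       -mattr=slow-misaligned-128store
;   => cost of 12 for the store
define void @single_store() {
  store <2 x i64> undef, <2 x i64>* undef
  ret void
}
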
diff --git a/test/Analysis/CostModel/X86/slm-arith-costs.ll b/test/Analysis/CostModel/X86/slm-arith-costs.ll
new file mode 100644
index 0000000000000..3673a5d9e0673
--- /dev/null
+++ b/test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -0,0 +1,317 @@
+; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefix=SLM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; 8bit mul
+define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i8
+ %res = mul nsw i8 %a, %b
+ ret i8 %res
+}
+
+define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
+ %res = mul nsw <2 x i8> %a, %b
+ ret <2 x i8> %res
+}
+
+define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) {
+entry:
+; SLM: cost of 3 {{.*}} mul nsw <4 x i8>
+ %res = mul nsw <4 x i8> %a, %b
+ ret <4 x i8> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_zext_mul(<4 x i8> %a) {
+entry:
+; SLM: cost of 3 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 255, i32 255, i32 255, i32 255>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_zext_mul_fail(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 255, i32 255, i32 -1, i32 255>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_zext_mul_fail_2(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 255, i32 256, i32 255, i32 255>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_sext_mul(<4 x i8> %a) {
+entry:
+; SLM: cost of 3 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 127, i32 -128, i32 127, i32 -128>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_sext_mul_fail(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 127, i32 -128, i32 128, i32 -128>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_8_v4_sext_mul_fail_2(<4 x i8> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i8> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 127, i32 -129, i32 127, i32 -128>
+ ret <4 x i32> %res
+}
+
+define <8 x i8> @slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) {
+entry:
+; SLM: cost of 2 {{.*}} mul nsw <8 x i8>
+ %res = mul nsw <8 x i8> %a, %b
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; SLM: cost of 14 {{.*}} mul nsw <16 x i8>
+ %res = mul nsw <16 x i8> %a, %b
+ ret <16 x i8> %res
+}
+
+; 16bit mul
+define i16 @slm-costs_16_scalar_mul(i16 %a, i16 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i16
+ %res = mul nsw i16 %a, %b
+ ret i16 %res
+}
+
+define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
+ %res = mul nsw <2 x i16> %a, %b
+ ret <2 x i16> %res
+}
+
+define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i16>
+ %res = mul nsw <4 x i16> %a, %b
+ ret <4 x i16> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_zext_mul(<4 x i16> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 65535, i32 65535, i32 65535, i32 65535>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_zext_mul_fail(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 -1, i32 65535, i32 65535, i32 65535>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_zext_mul_fail_2(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %zext = zext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %zext, <i32 65536, i32 65535, i32 65535, i32 65535>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_sext_mul(<4 x i16> %a) {
+entry:
+; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32767, i32 -32768>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_sext_mul_fail(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32768, i32 -32768>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @slm-costs_16_v4_sext_mul_fail_2(<4 x i16> %a) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %sext = sext <4 x i16> %a to <4 x i32>
+ %res = mul nsw <4 x i32> %sext, <i32 32767, i32 -32768, i32 32767, i32 -32769>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @slm-costs_16_v8_mul(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; SLM: cost of 2 {{.*}} mul nsw <8 x i16>
+ %res = mul nsw <8 x i16> %a, %b
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @slm-costs_16_v16_mul(<16 x i16> %a, <16 x i16> %b) {
+entry:
+; SLM: cost of 4 {{.*}} mul nsw <16 x i16>
+ %res = mul nsw <16 x i16> %a, %b
+ ret <16 x i16> %res
+}
+
+; 32bit mul
+define i32 @slm-costs_32_scalar_mul(i32 %a, i32 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i32
+ %res = mul nsw i32 %a, %b
+ ret i32 %res
+}
+
+define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
+ %res = mul nsw <2 x i32> %a, %b
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @slm-costs_32_v4_mul(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
+ %res = mul nsw <4 x i32> %a, %b
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @slm-costs_32_v8_mul(<8 x i32> %a, <8 x i32> %b) {
+entry:
+; SLM: cost of 22 {{.*}} mul nsw <8 x i32>
+ %res = mul nsw <8 x i32> %a, %b
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @slm-costs_32_v16_mul(<16 x i32> %a, <16 x i32> %b) {
+entry:
+; SLM: cost of 44 {{.*}} mul nsw <16 x i32>
+ %res = mul nsw <16 x i32> %a, %b
+ ret <16 x i32> %res
+}
+
+; 64bit mul
+define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) {
+entry:
+; SLM: cost of 1 {{.*}} mul nsw i64
+ %res = mul nsw i64 %a, %b
+ ret i64 %res
+}
+
+define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
+ %res = mul nsw <2 x i64> %a, %b
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
+entry:
+; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
+ %res = mul nsw <4 x i64> %a, %b
+ ret <4 x i64> %res
+}
+
+define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
+entry:
+; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
+ %res = mul nsw <8 x i64> %a, %b
+ ret <8 x i64> %res
+}
+
+define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
+entry:
+; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
+ %res = mul nsw <16 x i64> %a, %b
+ ret <16 x i64> %res
+}
+
+; mulsd
+define double @slm-costs_mulsd(double %a, double %b) {
+entry:
+; SLM: cost of 2 {{.*}} fmul double
+ %res = fmul double %a, %b
+ ret double %res
+}
+
+; mulpd
+define <2 x double> @slm-costs_mulpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 4 {{.*}} fmul <2 x double>
+ %res = fmul <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
+; mulps
+define <4 x float> @slm-costs_mulps(<4 x float> %a, <4 x float> %b) {
+entry:
+; SLM: cost of 2 {{.*}} fmul <4 x float>
+ %res = fmul <4 x float> %a, %b
+ ret <4 x float> %res
+}
+
+; divss
+define float @slm-costs_divss(float %a, float %b) {
+entry:
+; SLM: cost of 17 {{.*}} fdiv float
+ %res = fdiv float %a, %b
+ ret float %res
+}
+
+; divps
+define <4 x float> @slm-costs_divps(<4 x float> %a, <4 x float> %b) {
+entry:
+; SLM: cost of 39 {{.*}} fdiv <4 x float>
+ %res = fdiv <4 x float> %a, %b
+ ret <4 x float> %res
+}
+
+; divsd
+define double @slm-costs_divsd(double %a, double %b) {
+entry:
+; SLM: cost of 32 {{.*}} fdiv double
+ %res = fdiv double %a, %b
+ ret double %res
+}
+
+; divpd
+define <2 x double> @slm-costs_divpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 69 {{.*}} fdiv <2 x double>
+ %res = fdiv <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
+; addpd
+define <2 x double> @slm-costs_addpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 2 {{.*}} fadd <2 x double>
+ %res = fadd <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
+; subpd
+define <2 x double> @slm-costs_subpd(<2 x double> %a, <2 x double> %b) {
+entry:
+; SLM: cost of 2 {{.*}} fsub <2 x double>
+ %res = fsub <2 x double> %a, %b
+ ret <2 x double> %res
+}
+
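An observation that may help when reading the *_fail cases above (a sketch, not part of the patch): each fail variant differs from its base case by a single constant lane that leaves the source type's value range, so the multiply can no longer be costed as a widened narrow multiply. For zext from i8 every lane must lie in [0, 255] (hence 256 and -1 fail); for sext from i8, in [-128, 127] (hence 128 and -129 fail). The i16 variants follow the same pattern with [0, 65535] and [-32768, -32767..32767]:

; illustrative pair, mirroring the tests above
define <4 x i32> @lanes_in_range(<4 x i8> %a) {
  %z = zext <4 x i8> %a to <4 x i32>
  ; every constant lane fits i8's unsigned range -> SLM cost 3
  %r = mul nsw <4 x i32> %z, <i32 255, i32 255, i32 255, i32 255>
  ret <4 x i32> %r
}
define <4 x i32> @one_lane_out_of_range(<4 x i8> %a) {
  %z = zext <4 x i8> %a to <4 x i32>
  ; the 256 lane does not fit in i8 -> SLM cost 5
  %r = mul nsw <4 x i32> %z, <i32 255, i32 256, i32 255, i32 255>
  ret <4 x i32> %r
}
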
diff --git a/test/Analysis/CostModel/X86/strided-load-i16.ll b/test/Analysis/CostModel/X86/strided-load-i16.ll
index 2c2cf3938bcb3..ef786e7509503 100755
--- a/test/Analysis/CostModel/X86/strided-load-i16.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i16.ll
@@ -1,113 +1,113 @@
-; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i16] zeroinitializer, align 16
-@B = global [10240 x i16] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_i16_stride2() {
-;CHECK-LABEL: load_i16_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i16_stride3() {
-;CHECK-LABEL: load_i16_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i16_stride4() {
-;CHECK-LABEL: load_i16_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 2
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i16_stride5() {
-;CHECK-LABEL: load_i16_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 5
- %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
- %1 = load i16, i16* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
- store i16 %1, i16* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i16] zeroinitializer, align 16
+@B = global [10240 x i16] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_i16_stride2() {
+;CHECK-LABEL: load_i16_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i16_stride3() {
+;CHECK-LABEL: load_i16_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i16_stride4() {
+;CHECK-LABEL: load_i16_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i16_stride5() {
+;CHECK-LABEL: load_i16_stride5
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 5
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/strided-load-i32.ll b/test/Analysis/CostModel/X86/strided-load-i32.ll
index 0dcd3929da7f2..fad9def2bf784 100755
--- a/test/Analysis/CostModel/X86/strided-load-i32.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i32.ll
@@ -1,110 +1,110 @@
-; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i32] zeroinitializer, align 16
-@B = global [10240 x i32] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_int_stride2() {
-;CHECK-LABEL: load_int_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_int_stride3() {
-;CHECK-LABEL: load_int_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_int_stride4() {
-;CHECK-LABEL: load_int_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 2
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_int_stride5() {
-;CHECK-LABEL: load_int_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 5
- %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
- %1 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
- store i32 %1, i32* %arrayidx2, align 2
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i32] zeroinitializer, align 16
+@B = global [10240 x i32] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_int_stride2() {
+;CHECK-LABEL: load_int_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_int_stride3() {
+;CHECK-LABEL: load_int_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_int_stride4() {
+;CHECK-LABEL: load_int_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_int_stride5() {
+;CHECK-LABEL: load_int_stride5
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 5
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
diff --git a/test/Analysis/CostModel/X86/strided-load-i64.ll b/test/Analysis/CostModel/X86/strided-load-i64.ll
index 0370b6f80efdb..a7f593017d603 100755
--- a/test/Analysis/CostModel/X86/strided-load-i64.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i64.ll
@@ -1,81 +1,81 @@
-; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i64] zeroinitializer, align 16
-@B = global [10240 x i64] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_i64_stride2() {
-;CHECK-LABEL: load_i64_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
- %1 = load i64, i64* %arrayidx, align 16
- %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
- store i64 %1, i64* %arrayidx2, align 8
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i64_stride3() {
-;CHECK-LABEL: load_i64_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
- %1 = load i64, i64* %arrayidx, align 16
- %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
- store i64 %1, i64* %arrayidx2, align 8
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i64_stride4() {
-;CHECK-LABEL: load_i64_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 4
- %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
- %1 = load i64, i64* %arrayidx, align 16
- %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
- store i64 %1, i64* %arrayidx2, align 8
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_i64_stride2() {
+;CHECK-LABEL: load_i64_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
+ %1 = load i64, i64* %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+ store i64 %1, i64* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i64_stride3() {
+;CHECK-LABEL: load_i64_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
+ %1 = load i64, i64* %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+ store i64 %1, i64* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i64_stride4() {
+;CHECK-LABEL: load_i64_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 4
+ %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
+ %1 = load i64, i64* %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+ store i64 %1, i64* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/strided-load-i8.ll b/test/Analysis/CostModel/X86/strided-load-i8.ll
index 2a3a838641519..a97a32c5c9407 100755
--- a/test/Analysis/CostModel/X86/strided-load-i8.ll
+++ b/test/Analysis/CostModel/X86/strided-load-i8.ll
@@ -1,117 +1,117 @@
-; REQUIRES: asserts
-; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = global [10240 x i8] zeroinitializer, align 16
-@B = global [10240 x i8] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define void @load_i8_stride2() {
-;CHECK-LABEL: load_i8_stride2
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 1
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i8_stride3() {
-;CHECK-LABEL: load_i8_stride3
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 3
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i8_stride4() {
-;CHECK-LABEL: load_i8_stride4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = shl nsw i64 %indvars.iv, 2
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
-
-define void @load_i8_stride5() {
-;CHECK-LABEL: load_i8_stride5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
-;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
-;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load
-;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load
-;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load
-;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load
-;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %0 = mul nsw i64 %indvars.iv, 5
- %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
- %1 = load i8, i8* %arrayidx, align 2
- %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
- store i8 %1, i8* %arrayidx2, align 1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond = icmp eq i64 %indvars.iv.next, 1024
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body
- ret void
-}
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i8] zeroinitializer, align 16
+@B = global [10240 x i8] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_i8_stride2() {
+;CHECK-LABEL: load_i8_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i8_stride3() {
+;CHECK-LABEL: load_i8_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i8_stride4() {
+;CHECK-LABEL: load_i8_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i8_stride5() {
+;CHECK-LABEL: load_i8_stride5
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 5
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index 6756f3ba2802e..13519018d9570 100644
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -7,6 +7,9 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BWVL
; Verify the cost of vector arithmetic shift right instructions.
@@ -121,6 +124,8 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 20 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <32 x i16> %a, %b
ret <32 x i16> %shift
@@ -146,6 +151,8 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX2: Found an estimated cost of 24 for instruction: %shift
; AVX512F: Found an estimated cost of 24 for instruction: %shift
; AVX512BW: Found an estimated cost of 24 for instruction: %shift
+; AVX512VL: Found an estimated cost of 24 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift
; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = ashr <32 x i8> %a, %b
ret <32 x i8> %shift
@@ -158,7 +165,9 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX: Found an estimated cost of 96 for instruction: %shift
; AVX2: Found an estimated cost of 48 for instruction: %shift
; AVX512F: Found an estimated cost of 48 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 24 for instruction: %shift
+; AVX512VL: Found an estimated cost of 48 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -283,6 +292,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 20 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
%shift = ashr <32 x i16> %a, %splat
@@ -322,7 +333,9 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX: Found an estimated cost of 96 for instruction: %shift
; AVX2: Found an estimated cost of 48 for instruction: %shift
; AVX512F: Found an estimated cost of 48 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 24 for instruction: %shift
+; AVX512VL: Found an estimated cost of 48 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = ashr <64 x i8> %a, %splat
@@ -440,6 +453,8 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 20 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <32 x i16> %shift
@@ -476,7 +491,9 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
; AVX: Found an estimated cost of 96 for instruction: %shift
; AVX2: Found an estimated cost of 48 for instruction: %shift
; AVX512F: Found an estimated cost of 48 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 24 for instruction: %shift
+; AVX512VL: Found an estimated cost of 48 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 24 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
@@ -593,6 +610,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 2 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -632,6 +651,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; AVX2: Found an estimated cost of 8 for instruction: %shift
; AVX512F: Found an estimated cost of 8 for instruction: %shift
; AVX512BW: Found an estimated cost of 4 for instruction: %shift
+; AVX512VL: Found an estimated cost of 8 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 4 for instruction: %shift
; XOPAVX: Found an estimated cost of 16 for instruction: %shift
; XOPAVX2: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
index 63e6db194d520..1e0fbce710ef8 100644
--- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -7,6 +7,9 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BWVL
; Verify the cost of vector logical shift right instructions.
@@ -124,6 +127,8 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 20 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = lshr <32 x i16> %a, %b
ret <32 x i16> %shift
@@ -160,7 +165,9 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 11 for instruction: %shift
+; AVX512VL: Found an estimated cost of 22 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -288,6 +295,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 20 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
%shift = lshr <32 x i16> %a, %splat
@@ -327,7 +336,9 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 11 for instruction: %shift
+; AVX512VL: Found an estimated cost of 22 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = lshr <64 x i8> %a, %splat
@@ -448,6 +459,8 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 20 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <32 x i16> %shift
@@ -484,7 +497,9 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 11 for instruction: %shift
+; AVX512VL: Found an estimated cost of 22 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
@@ -603,6 +618,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 2 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -642,6 +659,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512F: Found an estimated cost of 4 for instruction: %shift
; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512VL: Found an estimated cost of 4 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 2 for instruction: %shift
; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
index 8c42bd66c707a..031d530dcd565 100644
--- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -7,6 +7,9 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw,+avx512vl -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BWVL
; Verify the cost of vector shift left instructions.
@@ -161,7 +164,9 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX: Found an estimated cost of 44 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 11 for instruction: %shift
+; AVX512VL: Found an estimated cost of 22 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift
; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -289,6 +294,8 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 20 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 4 for instruction: %shift
%splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i16> %a, %splat
@@ -328,7 +335,9 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX: Found an estimated cost of 44 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 11 for instruction: %shift
+; AVX512VL: Found an estimated cost of 22 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift
; XOP: Found an estimated cost of 4 for instruction: %shift
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = shl <64 x i8> %a, %splat
@@ -450,6 +459,8 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 2 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
@@ -487,7 +498,9 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
; AVX: Found an estimated cost of 44 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 11 for instruction: %shift
+; AVX512VL: Found an estimated cost of 22 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 11 for instruction: %shift
; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
@@ -608,6 +621,8 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; AVX512VL: Found an estimated cost of 2 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -647,6 +662,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512F: Found an estimated cost of 4 for instruction: %shift
; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512VL: Found an estimated cost of 4 for instruction: %shift
+; AVX512BWVL: Found an estimated cost of 2 for instruction: %shift
; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/test/Analysis/ScalarEvolution/max-trip-count.ll b/test/Analysis/ScalarEvolution/max-trip-count.ll
index 614e9b265ab09..d87e7d033a1e1 100644
--- a/test/Analysis/ScalarEvolution/max-trip-count.ll
+++ b/test/Analysis/ScalarEvolution/max-trip-count.ll
@@ -207,3 +207,84 @@ for.cond.i: ; preds = %for.body.i
bar.exit: ; preds = %for.cond.i, %for.body.i
ret i32 0
}
+
+; CHECK-LABEL: @ne_max_trip_count_1
+; CHECK: Loop %for.body: max backedge-taken count is 7
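+; %masked is at most 7 and the exit compares the pre-increment %i against it, so at most 7 backedges are taken.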
+define i32 @ne_max_trip_count_1(i32 %n) {
+entry:
+ %masked = and i32 %n, 7
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %add = add nsw i32 %i, 1
+ %cmp = icmp ne i32 %i, %masked
+ br i1 %cmp, label %for.body, label %bar.exit
+
+bar.exit:
+ ret i32 0
+}
+
+; CHECK-LABEL: @ne_max_trip_count_2
+; CHECK: Loop %for.body: max backedge-taken count is -1
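+; The exit compares the post-increment %add, so if %masked is 0 the loop can only terminate on wrap: max count is UINT32_MAX (-1).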
+define i32 @ne_max_trip_count_2(i32 %n) {
+entry:
+ %masked = and i32 %n, 7
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %add = add nsw i32 %i, 1
+ %cmp = icmp ne i32 %add, %masked
+ br i1 %cmp, label %for.body, label %bar.exit
+
+bar.exit:
+ ret i32 0
+}
+
+; CHECK-LABEL: @ne_max_trip_count_3
+; CHECK: Loop %for.body: max backedge-taken count is 6
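+; The guard excludes %masked == 0, so %masked is in [1, 7]; %add starts at 1, leaving at most 6 backedges.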
+define i32 @ne_max_trip_count_3(i32 %n) {
+entry:
+ %masked = and i32 %n, 7
+ %guard = icmp eq i32 %masked, 0
+ br i1 %guard, label %exit, label %for.preheader
+
+for.preheader:
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %for.preheader ], [ %add, %for.body ]
+ %add = add nsw i32 %i, 1
+ %cmp = icmp ne i32 %add, %masked
+ br i1 %cmp, label %for.body, label %loop.exit
+
+loop.exit:
+ br label %exit
+
+exit:
+ ret i32 0
+}
+
+; CHECK-LABEL: @ne_max_trip_count_4
+; CHECK: Loop %for.body: max backedge-taken count is -2
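+; %n is only known to be nonzero, so the worst case %n == UINT32_MAX gives UINT32_MAX - 1 (-2) backedges.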
+define i32 @ne_max_trip_count_4(i32 %n) {
+entry:
+ %guard = icmp eq i32 %n, 0
+ br i1 %guard, label %exit, label %for.preheader
+
+for.preheader:
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %for.preheader ], [ %add, %for.body ]
+ %add = add nsw i32 %i, 1
+ %cmp = icmp ne i32 %add, %n
+ br i1 %cmp, label %for.body, label %loop.exit
+
+loop.exit:
+ br label %exit
+
+exit:
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index e91a1a42c233c..8d9a8c06aa3c5 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -904,8 +904,9 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
; CHECK-LABEL: test_extracts_inserts_varidx_extract:
; CHECK: str q0
-; CHECK: add x[[PTR:[0-9]+]], {{.*}}, w0, sxtw #1
-; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], [x[[PTR]]]
+; CHECK-DAG: and [[MASKED_IDX:x[0-9]+]], x0, #0x7
+; CHECK: bfi [[PTR:x[0-9]+]], [[MASKED_IDX]], #1, #3
+; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], {{\[}}[[PTR]]{{\]}}
; CHECK-DAG: ins v[[R]].h[1], v0.h[1]
; CHECK-DAG: ins v[[R]].h[2], v0.h[2]
; CHECK-DAG: ins v[[R]].h[3], v0.h[3]
@@ -922,7 +923,9 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
}
; CHECK-LABEL: test_extracts_inserts_varidx_insert:
-; CHECK: str h0, [{{.*}}, w0, sxtw #1]
+; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3
+; CHECK: bfi x9, [[MASKED_IDX]], #1, #2
+; CHECK: st1 { v0.h }[0], [x9]
; CHECK-DAG: ldr d[[R:[0-9]+]]
; CHECK-DAG: ins v[[R]].h[1], v0.h[1]
; CHECK-DAG: ins v[[R]].h[2], v0.h[2]
diff --git a/test/CodeGen/AArch64/arm64-nvcast.ll b/test/CodeGen/AArch64/arm64-nvcast.ll
index c3a1640ab0129..ba2512718c4e1 100644
--- a/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -1,10 +1,12 @@
; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
; CHECK-LABEL: _test:
-; CHECK: fmov.2d v0, #2.00000000
-; CHECK: str q0, [sp, #-16]!
-; CHECK: mov x8, sp
-; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK-DAG: fmov.2d v0, #2.00000000
+; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
+; CHECK-DAG: mov x9, sp
+; CHECK-DAG: str q0, [sp], #16
+; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
+; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
; CHECK: str s0, [x0]
define void @test(float * %p1, i32 %v1) {
@@ -16,9 +18,11 @@ entry:
; CHECK-LABEL: _test2
; CHECK: movi.16b v0, #63
-; CHECK: str q0, [sp, #-16]!
-; CHECK: mov x8, sp
-; CHECK: ldr s0, [x8, w1, sxtw #2]
+; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
+; CHECK-DAG: str q0, [sp], #16
+; CHECK-DAG: mov x9, sp
+; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
+; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
; CHECK: str s0, [x0]
define void @test2(float * %p1, i32 %v1) {
diff --git a/test/CodeGen/AArch64/bitreverse.ll b/test/CodeGen/AArch64/bitreverse.ll
index 135bce3bdb6cc..85496ab032146 100644
--- a/test/CodeGen/AArch64/bitreverse.ll
+++ b/test/CodeGen/AArch64/bitreverse.ll
@@ -1,14 +1,18 @@
; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
-; These tests just check that the plumbing is in place for @llvm.bitreverse. The
-; actual output is massive at the moment as llvm.bitreverse is not yet legal.
+; These tests just check that the plumbing is in place for @llvm.bitreverse.
declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
define <2 x i16> @f(<2 x i16> %a) {
; CHECK-LABEL: f:
-; CHECK: rev32
-; CHECK: ushr
+; CHECK: fmov [[REG1:w[0-9]+]], s0
+; CHECK-DAG: rbit [[REG2:w[0-9]+]], [[REG1]]
+; CHECK-DAG: fmov s0, [[REG2]]
+; CHECK-DAG: mov [[REG3:w[0-9]+]], v0.s[1]
+; CHECK-DAG: rbit [[REG4:w[0-9]+]], [[REG3]]
+; CHECK-DAG: ins v0.s[1], [[REG4]]
+; CHECK-DAG: ushr v0.2s, v0.2s, #16
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b
}
@@ -17,26 +21,9 @@ declare i8 @llvm.bitreverse.i8(i8) readnone
define i8 @g(i8 %a) {
; CHECK-LABEL: g:
-; CHECK-DAG: rev [[RV:w.*]], w0
-; CHECK-DAG: and [[L4:w.*]], [[RV]], #0xf0f0f0f
-; CHECK-DAG: and [[H4:w.*]], [[RV]], #0xf0f0f0f0
-; CHECK-DAG: lsr [[S4:w.*]], [[H4]], #4
-; CHECK-DAG: orr [[R4:w.*]], [[S4]], [[L4]], lsl #4
-
-; CHECK-DAG: and [[L2:w.*]], [[R4]], #0x33333333
-; CHECK-DAG: and [[H2:w.*]], [[R4]], #0xcccccccc
-; CHECK-DAG: lsr [[S2:w.*]], [[H2]], #2
-; CHECK-DAG: orr [[R2:w.*]], [[S2]], [[L2]], lsl #2
-
-; CHECK-DAG: mov [[P1:w.*]], #1426063360
-; CHECK-DAG: mov [[N1:w.*]], #-1442840576
-; CHECK-DAG: and [[L1:w.*]], [[R2]], [[P1]]
-; CHECK-DAG: and [[H1:w.*]], [[R2]], [[N1]]
-; CHECK-DAG: lsr [[S1:w.*]], [[H1]], #1
-; CHECK-DAG: orr [[R1:w.*]], [[S1]], [[L1]], lsl #1
-
-; CHECK-DAG: lsr w0, [[R1]], #24
-; CHECK-DAG: ret
+; CHECK: rbit [[REG:w[0-9]+]], w0
+; CHECK-NEXT: lsr w0, [[REG]], #24
+; CHECK-NEXT: ret
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
}
diff --git a/test/CodeGen/AArch64/rbit.ll b/test/CodeGen/AArch64/rbit.ll
index 3404ae4b6beef..288a25bd65e3d 100644
--- a/test/CodeGen/AArch64/rbit.ll
+++ b/test/CodeGen/AArch64/rbit.ll
@@ -1,5 +1,8 @@
; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+; The llvm.aarch64.rbit intrinsic should be auto-upgraded to the
+; target-independent bitreverse intrinsic.
+
; CHECK-LABEL: rbit32
; CHECK: rbit w0, w0
define i32 @rbit32(i32 %t) {
@@ -18,3 +21,22 @@ entry:
declare i64 @llvm.aarch64.rbit.i64(i64)
declare i32 @llvm.aarch64.rbit.i32(i32)
+
+; CHECK-LABEL: rbit_generic32
+; CHECK: rbit w0, w0
+define i32 @rbit_generic32(i32 %t) {
+entry:
+ %rbit = call i32 @llvm.bitreverse.i32(i32 %t)
+ ret i32 %rbit
+}
+
+; CHECK-LABEL: rbit_generic64
+; CHECK: rbit x0, x0
+define i64 @rbit_generic64(i64 %t) {
+entry:
+ %rbit = call i64 @llvm.bitreverse.i64(i64 %t)
+ ret i64 %rbit
+}
+
+declare i32 @llvm.bitreverse.i32(i32) readnone
+declare i64 @llvm.bitreverse.i64(i64) readnone
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
new file mode 100644
index 0000000000000..34bb2588ad622
--- /dev/null
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -0,0 +1,858 @@
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
+--- |
+ define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %and = and i32 %a, 1234567
+ store volatile i32 %and, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %and = and i32 %a, 1234567
+ store i32 %and, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %shl = shl i32 %a, 12
+ store volatile i32 %shl, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %shl = shl i32 %a, 12
+ store i32 %shl, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %ashr = ashr i32 %a, 12
+ store volatile i32 %ashr, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %ashr = ashr i32 %a, 12
+ store i32 %ashr, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %lshr = lshr i32 %a, 12
+ store volatile i32 %lshr, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
+ %a = load i32, i32 addrspace(1)* %gep.a
+ %lshr = lshr i32 %a, 12
+ store i32 %lshr, i32 addrspace(1)* %gep.out
+ ret void
+ }
+
+ declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+ attributes #0 = { nounwind }
+ attributes #1 = { nounwind readnone }
+
+...
+---
+
+# GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}}
+# GCN: %10 = V_MOV_B32_e32 1543, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %10,
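+# 1234567 & 9999 = 1543, so the S_AND_B32 of two constants folds to a single move immediate.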
+name: s_fold_and_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_64_xexec }
+ - { id: 2, class: sreg_32_xm0 }
+ - { id: 3, class: sreg_32_xm0 }
+ - { id: 4, class: sreg_32_xm0 }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: sreg_128 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %2 = COPY %1.sub1
+ %3 = COPY %1.sub0
+ %4 = S_MOV_B32 61440
+ %5 = S_MOV_B32 -1
+ %6 = REG_SEQUENCE killed %2, 1, killed %3, 2, killed %4, 3, killed %5, 4
+ %7 = S_MOV_B32 1234567
+ %8 = S_MOV_B32 9999
+ %9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc
+ %10 = COPY %9
+ BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: v_fold_and_imm_regimm_32{{$}}
+
+# GCN: %9 = V_MOV_B32_e32 646, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %9,
+
+# GCN: %10 = V_MOV_B32_e32 646, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %10
+
+# GCN: %11 = V_MOV_B32_e32 646, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %11,
+
+# GCN: %12 = V_MOV_B32_e32 1234567, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %12,
+
+# GCN: %13 = V_MOV_B32_e32 63, implicit %exec
+# GCN: FLAT_STORE_DWORD %19, %13,
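+# 1234567 & 982 = 646 for the first three stores; the last two AND a constant with itself.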
+
+name: v_fold_and_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 20, class: sreg_32_xm0 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: sreg_32_xm0 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 29, class: vgpr_32 }
+ - { id: 30, class: vgpr_32 }
+ - { id: 31, class: vgpr_32 }
+ - { id: 32, class: vreg_64 }
+ - { id: 33, class: vreg_64 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+ - { id: 36, class: vgpr_32 }
+ - { id: 37, class: vreg_64 }
+ - { id: 44, class: vgpr_32 }
+
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %31 = V_ASHRREV_I32_e64 31, %3, implicit %exec
+ %32 = REG_SEQUENCE %3, 1, %31, 2
+ %33 = V_LSHLREV_B64 2, killed %32, implicit %exec
+ %20 = COPY %4.sub1
+ %44 = V_ADD_I32_e32 %4.sub0, %33.sub0, implicit-def %vcc, implicit %exec
+ %36 = COPY killed %20
+ %35 = V_ADDC_U32_e32 %33.sub1, %36, implicit-def %vcc, implicit %vcc, implicit %exec
+ %37 = REG_SEQUENCE %44, 1, killed %35, 2
+ %24 = V_MOV_B32_e32 982, implicit %exec
+ %26 = S_MOV_B32 1234567
+ %34 = V_MOV_B32_e32 63, implicit %exec
+
+ %27 = V_AND_B32_e64 %26, %24, implicit %exec
+ FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_AND_B32_e64 %24, %26, implicit %exec
+ FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %29 = V_AND_B32_e32 %26, %24, implicit %exec
+ FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %30 = V_AND_B32_e64 %26, %26, implicit %exec
+ FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %31 = V_AND_B32_e64 %34, %34, implicit %exec
+ FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: s_fold_shl_imm_regimm_32{{$}}
+# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %13,
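+# 1 << 12 = 4096.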
+
+name: s_fold_shl_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_MOV_B32 1
+ %6 = COPY %4.sub1
+ %7 = COPY %4.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
+ %12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc
+ %13 = COPY %12
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: v_fold_shl_imm_regimm_32{{$}}
+
+# GCN: %11 = V_MOV_B32_e32 40955904, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %11,
+
+# GCN: %12 = V_MOV_B32_e32 24, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %12,
+
+# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %13,
+
+# GCN: %14 = V_MOV_B32_e32 24, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %14,
+
+# GCN: %15 = V_MOV_B32_e32 0, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %15,
+
+# GCN: %22 = V_MOV_B32_e32 4096, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %22,
+
+# GCN: %23 = V_MOV_B32_e32 1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %23,
+
+# GCN: %25 = V_MOV_B32_e32 2, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %25,
+
+# GCN: %26 = V_MOV_B32_e32 7927808, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %26,
+
+# GCN: %28 = V_MOV_B32_e32 -8, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %28,
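+# Shift amounts are masked to 5 bits, e.g. %23 = 1 << (32 & 31) = 1 and %15 = 12 << (3871 & 31) = 0.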
+
+name: v_fold_shl_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64_xexec }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_64 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vreg_64 }
+ - { id: 17, class: vreg_64 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: sreg_32_xm0 }
+ - { id: 28, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %2 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
+ %16 = REG_SEQUENCE %2, 1, %15, 2
+ %17 = V_LSHLREV_B64 2, killed %16, implicit %exec
+ %9 = COPY %3.sub1
+ %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
+ %19 = COPY killed %9
+ %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+ %20 = REG_SEQUENCE %21, 1, killed %18, 2
+ %10 = V_MOV_B32_e32 9999, implicit %exec
+ %24 = V_MOV_B32_e32 3871, implicit %exec
+ %6 = V_MOV_B32_e32 1, implicit %exec
+ %7 = S_MOV_B32 1
+ %27 = S_MOV_B32 -4
+
+ %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %13 = V_LSHL_B32_e64 %7, 12, implicit %exec
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %14 = V_LSHL_B32_e64 12, %7, implicit %exec
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %15 = V_LSHL_B32_e64 12, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %22 = V_LSHL_B32_e64 %6, 12, implicit %exec
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %23 = V_LSHL_B32_e64 %6, 32, implicit %exec
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %25 = V_LSHL_B32_e32 %6, %6, implicit %exec
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_LSHL_B32_e32 %27, %6, implicit %exec
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: s_fold_ashr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 243, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8,
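+# 999123 ashr 12 = 243.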
+name: s_fold_ashr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_MOV_B32 999123
+ %6 = COPY %4.sub1
+ %7 = COPY %4.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
+ %12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc
+ %13 = COPY %12
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: v_fold_ashr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %11,
+
+# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %12,
+
+# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %13,
+
+# GCN: %14 = V_MOV_B32_e32 3, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %14,
+
+# GCN: %15 = V_MOV_B32_e32 -1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %15,
+
+# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %22,
+
+# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %23,
+
+# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %25,
+
+# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %26,
+
+# GCN: %28 = V_MOV_B32_e32 -1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %28,
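+# e.g. %11 = 999234234 ashr 8 = 3903258 and %15 = -4 ashr (3871 & 31) = -1.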
+
+name: v_fold_ashr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64_xexec }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vreg_64 }
+ - { id: 17, class: vreg_64 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: sreg_32_xm0 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 32, class: sreg_32_xm0 }
+ - { id: 33, class: sreg_32_xm0 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %2 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
+ %16 = REG_SEQUENCE %2, 1, %15, 2
+ %17 = V_LSHLREV_B64 2, killed %16, implicit %exec
+ %9 = COPY %3.sub1
+ %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
+ %19 = COPY killed %9
+ %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+ %20 = REG_SEQUENCE %21, 1, killed %18, 2
+ %10 = V_MOV_B32_e32 999234234, implicit %exec
+ %24 = V_MOV_B32_e32 3871, implicit %exec
+ %6 = V_MOV_B32_e32 1000000, implicit %exec
+ %7 = S_MOV_B32 13424252
+ %8 = S_MOV_B32 4
+ %27 = S_MOV_B32 -4
+ %32 = S_MOV_B32 1
+ %33 = S_MOV_B32 3841
+ %34 = V_MOV_B32_e32 3841, implicit %exec
+ %35 = V_MOV_B32_e32 2, implicit %exec
+
+ %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %13 = V_ASHR_I32_e64 %7, 3, implicit %exec
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %14 = V_ASHR_I32_e64 7, %32, implicit %exec
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %15 = V_ASHR_I32_e64 %27, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %22 = V_ASHR_I32_e64 %6, 4, implicit %exec
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %23 = V_ASHR_I32_e64 %6, %33, implicit %exec
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %25 = V_ASHR_I32_e32 %34, %34, implicit %exec
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_ASHR_I32_e32 %27, %35, implicit %exec
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: s_fold_lshr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 1048332, implicit %exec
+# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8,
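+# -999123 (0xfff0c12d) lshr 12 = 1048332.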
+name: s_fold_lshr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0_xexec }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sreg_32_xm0 }
+ - { id: 13, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1
+
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_MOV_B32 -999123
+ %6 = COPY %4.sub1
+ %7 = COPY %4.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
+ %12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc
+ %13 = COPY %12
+ BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: v_fold_lshr_imm_regimm_32{{$}}
+# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %11,
+
+# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %12,
+
+# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %13,
+
+# GCN: %14 = V_MOV_B32_e32 3, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %14,
+
+# GCN: %15 = V_MOV_B32_e32 1, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %15,
+
+# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %22,
+
+# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %23,
+
+# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %25,
+
+# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %26,
+
+# GCN: %28 = V_MOV_B32_e32 1073741823, implicit %exec
+# GCN: FLAT_STORE_DWORD %20, %28,
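+# Unlike the ashr case, %15 = -4 lshr (3871 & 31) = 1 and %28 = -4 lshr 2 = 0x3fffffff (1073741823).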
+
+name: v_fold_lshr_imm_regimm_32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64_xexec }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_32_xm0 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vreg_64 }
+ - { id: 17, class: vreg_64 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: sreg_32_xm0 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 32, class: sreg_32_xm0 }
+ - { id: 33, class: sreg_32_xm0 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %2 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
+ %16 = REG_SEQUENCE %2, 1, %15, 2
+ %17 = V_LSHLREV_B64 2, killed %16, implicit %exec
+ %9 = COPY %3.sub1
+ %21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
+ %19 = COPY killed %9
+ %18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+ %20 = REG_SEQUENCE %21, 1, killed %18, 2
+ %10 = V_MOV_B32_e32 999234234, implicit %exec
+ %24 = V_MOV_B32_e32 3871, implicit %exec
+ %6 = V_MOV_B32_e32 1000000, implicit %exec
+ %7 = S_MOV_B32 13424252
+ %8 = S_MOV_B32 4
+ %27 = S_MOV_B32 -4
+ %32 = S_MOV_B32 1
+ %33 = S_MOV_B32 3841
+ %34 = V_MOV_B32_e32 3841, implicit %exec
+ %35 = V_MOV_B32_e32 2, implicit %exec
+
+ %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %13 = V_LSHR_B32_e64 %7, 3, implicit %exec
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %14 = V_LSHR_B32_e64 7, %32, implicit %exec
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %15 = V_LSHR_B32_e64 %27, %24, implicit %exec
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %22 = V_LSHR_B32_e64 %6, 4, implicit %exec
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %23 = V_LSHR_B32_e64 %6, %33, implicit %exec
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %25 = V_LSHR_B32_e32 %34, %34, implicit %exec
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ %28 = V_LSHR_B32_e32 %27, %35, implicit %exec
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
new file mode 100644
index 0000000000000..b74bce76f79c4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -0,0 +1,262 @@
+; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
+
+; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
+; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
+
+; FIXME: This should also fold when fma is actually fast if an FMA
+; exists in the original program.
+
+; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
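+; For example, with x=2, y=3, u=4, v=5, z=1 both forms give
+; (2*3) + ((4*5) + 1) == 27, so the reassociation is value-preserving
+; up to rounding.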
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
+
+; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
+; GCN-FASTFMA: buffer_store_dword [[FMA1]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
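+; With flushed denormals the combine lowers to two v_mac instructions, with
+; fast fma to a chain of two v_fma, and with slow fma the multiplies and adds
+; stay separate.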
+define void @fast_add_fmuladd_fmul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fadd fast float %fma, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]+]], [[U]], [[V]], -[[Z]]
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[TMP]]
+
+; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
+; GCN-FASTFMA: buffer_store_dword [[FMA1]]
+define void @fast_sub_fmuladd_fmul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %sub = fsub fast float %fma, %z
+ store volatile float %sub, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[MUL]]
+
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ store volatile float %mul.u.v, float addrspace(1)* undef
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fadd fast float %fma, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
+; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[MUL]], [[Z]]
+
+; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ store volatile float %mul.u.v, float addrspace(1)* undef
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %add = fadd fast float %z, %fma
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ store volatile float %fma, float addrspace(1)* undef
+ %add = fadd fast float %fma, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_mul_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_add_f32_e32
+define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ store volatile float %fma, float addrspace(1)* undef
+ %add = fadd fast float %z, %fma
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+
+; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]]
+; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+
+; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]]
+
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]]
+
+; GCN: buffer_store_dword [[MUL]]
+; GCN: buffer_store_dword [[SUB]]
+define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %sub = fsub fast float %fma, %z
+ store volatile float %mul.u.v, float addrspace(1)* undef
+ store volatile float %sub, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[U:v[0-9]+]]
+; GCN: buffer_load_dword [[V:v[0-9]+]]
+
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]]
+
+; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]]
+; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
+; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
+
+; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[MUL]]
+; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
+; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
+; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
+
+; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
+; GCN-SLOWFMA: v_add_f32_e32
+; GCN-SLOWFMA: v_subrev_f32_e32
+define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %u = load volatile float, float addrspace(1)* undef
+ %v = load volatile float, float addrspace(1)* undef
+ %mul.u.v = fmul fast float %u, %v
+ %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
+ %sub = fsub fast float %fma, %z
+ store volatile float %fma, float addrspace(1)* undef
+ store volatile float %sub, float addrspace(1)* undef
+ ret void
+}
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 9df336c2c4895..10acae092e9f6 100644
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -18,9 +18,9 @@ declare float @llvm.fabs.f32(float) #1
; VI: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; VI: v_cndmask_b32_e32
-; VI: v_add_f32_e32
-; VI: v_mul_f32_e32
-; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
+; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
%a11 = fadd fast float %y, -1.0
%a12 = call float @llvm.fabs.f32(float %a11)
@@ -113,9 +113,9 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
; VI: v_add_f16_e64 v{{[0-9]+}}, s{{[0-9]+}}, -1.0
; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; VI: v_cndmask_b32_e32
-; VI: v_add_f16_e32
-; VI: v_mul_f16_e32
-; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
+; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
%x = bitcast i16 %x.arg to half
%y = bitcast i16 %y.arg to half
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
new file mode 100644
index 0000000000000..d555d8d871de5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -0,0 +1,1282 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+
+; --------------------------------------------------------------------------------
+; fadd tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
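+; fneg(a + b) folds into the add's source modifier: -(a + b) == (-a) - b, so
+; no separate negate instruction is emitted.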
+define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %add = fadd float %a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
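+; When the add result has another use, the negated copy is materialized with a
+; sign-bit xor (0x80000000) instead of folding into the add.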
+define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %add = fadd float %a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %add, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %add = fadd float %a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ %use1 = fmul float %add, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %add = fadd float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %add = fadd float %a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[ADD]]
+define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %add = fadd float %fneg.a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[NEG_A]]
+define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %add = fadd float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %add = fadd float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %add
+ %use1 = fmul float %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmul tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN: v_mul_f32_e32 [[USE1:v[0-9]+]], 4.0, [[MUL]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[USE1]]
+define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = fmul float %a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = fmul float %mul, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = fmul float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = fmul float %a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = fmul float %fneg.a, %fneg.b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = fmul float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = fmul float %fneg.a, %b
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = fmul float %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fma tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_fma_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
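+; fneg distributes over fma by negating one multiplicand and the addend:
+; -(a*b + c) == a*(-b) + (-c).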
+define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
+; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fma, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
+; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ %use1 = fmul float %fma, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.c = fsub float -0.000000e+00, %c
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.c = fsub float -0.000000e+00, %c
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[FMA]]
+; GCN-NEXT: buffer_store_dword [[NEG_A]]
+define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ %use1 = fmul float %fneg.a, %d
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmad tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_fmad_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
+; GCN-NEXT: buffer_store_dword [[NEG_C]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %c = load volatile float, float addrspace(1)* %c.gep
+ %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+ %fneg = fsub float -0.000000e+00, %fma
+ %use1 = fmul float %fma, 4.0
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fp_extend tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
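+; The fneg is absorbed as a source modifier on the f32-to-f64 conversion.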
+define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpext = fpext float %a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store double %fneg, double addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpext = fpext float %fneg.a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store double %fneg, double addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+; GCN: buffer_store_dword [[FNEG_A]]
+define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpext = fpext float %fneg.a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store volatile double %fneg, double addrspace(1)* %out.gep
+ store volatile float %fneg.a, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
+; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
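+; An f64 fneg only needs to flip the sign bit in the high 32-bit half.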
+define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpext = fpext float %a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ store volatile double %fneg, double addrspace(1)* %out.gep
+ store volatile double %fpext, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
+; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
+; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
+; GCN: buffer_store_dwordx2 [[MUL]]
+define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpext = fpext float %a to double
+ %fneg = fsub double -0.000000e+00, %fpext
+ %mul = fmul double %fpext, 4.0
+ store volatile double %fneg, double addrspace(1)* %out.gep
+ store volatile double %mul, double addrspace(1)* %out.gep
+ ret void
+}
+
+; FIXME: Source modifiers not folded for f16->f32
+; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
+define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile half, half addrspace(1)* %a.gep
+ %fpext = fpext half %a to float
+ %fneg = fsub float -0.000000e+00, %fpext
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %fpext, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
+define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile half, half addrspace(1)* %a.gep
+ %fpext = fpext half %a to float
+ %fneg = fsub float -0.000000e+00, %fpext
+ %mul = fmul float %fpext, 4.0
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %mul, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fp_round tests
+; --------------------------------------------------------------------------------
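+;
+; A sketch of the fold these tests exercise: negating a truncated
+; value is the same as truncating the negated source,
+;   fneg (fptrunc x) --> fptrunc (fneg x)
+; (exact, since IEEE rounding is sign-symmetric), so the negate can
+; become a source modifier on v_cvt_f32_f64 / v_cvt_f16_f32.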
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fpround = fptrunc double %a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fneg.a = fsub double -0.000000e+00, %a
+ %fpround = fptrunc double %fneg.a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
+; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
+; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[NEG_A_LO:[0-9]+]], v[[A_LO]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[NEG_A_LO]]:[[NEG_A_HI]]{{\]}}
+define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fneg.a = fsub double -0.000000e+00, %a
+ %fpround = fptrunc double %fneg.a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile double %fneg.a, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dwordx2 [[USE1]]
+define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fneg.a = fsub double -0.000000e+00, %a
+ %fpround = fptrunc double %fneg.a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ %use1 = fmul double %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile double %use1, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_short [[RESULT]]
+define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fpround = fptrunc float %a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ store half %fneg, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN: buffer_store_short [[RESULT]]
+define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpround = fptrunc float %fneg.a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ store half %fneg, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
+; GCN: buffer_store_dword [[NEG]]
+; GCN: buffer_store_dword [[CVT]]
+define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile double, double addrspace(1)* %a.gep
+ %fpround = fptrunc double %a to float
+ %fneg = fsub float -0.000000e+00, %fpround
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %fpround, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN: buffer_store_short [[RESULT]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpround = fptrunc float %fneg.a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ store volatile half %fneg, half addrspace(1)* %out.gep
+ store volatile float %fneg.a, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
+; GCN: buffer_store_short [[RESULT]]
+; GCN: buffer_store_dword [[USE1]]
+define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fpround = fptrunc float %fneg.a to half
+ %fneg = fsub half -0.000000e+00, %fpround
+ %use1 = fmul float %fneg.a, %c
+ store volatile half %fneg, half addrspace(1)* %out.gep
+ store volatile float %use1, float addrspace(1)* undef
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; rcp tests
+; --------------------------------------------------------------------------------
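+;
+; Since rcp(-x) = -rcp(x), these tests expect the negate to become a
+; source modifier on v_rcp_f32 and a pair of negates to cancel:
+;   fneg (rcp x)        --> rcp (fneg x)
+;   fneg (rcp (fneg x)) --> rcp x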
+
+; GCN-LABEL: {{^}}v_fneg_rcp_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %fneg.a, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ %use1 = fmul float %fneg.a, %c
+ store volatile float %fneg, float addrspace(1)* %out.gep
+ store volatile float %use1, float addrspace(1)* undef
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; rcp_legacy tests
+; --------------------------------------------------------------------------------
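+;
+; Same expectation as the rcp tests above, but for the legacy
+; intrinsic: the negate should become a source modifier on
+; v_rcp_legacy_f32.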
+
+; GCN-LABEL: {{^}}v_fneg_rcp_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %rcp = call float @llvm.amdgcn.rcp.legacy(float %a)
+ %fneg = fsub float -0.000000e+00, %rcp
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmul_legacy tests
+; --------------------------------------------------------------------------------
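+;
+; These tests expect the negate to fold into one multiplicand,
+;   fneg (mul_legacy x, y)        --> mul_legacy x, (fneg y)
+;   fneg (mul_legacy (fneg x), y) --> mul_legacy x, y
+; with the store-use and multi-use variants checking that a separate
+; sign-bit xor is kept when the unnegated product is needed too.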
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[RESULT]]
+define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN: v_mul_legacy_f32_e32 [[USE1:v[0-9]+]], 4.0, [[MUL]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[USE1]]
+define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %fneg.b = fsub float -0.000000e+00, %b
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[NEG_A]]
+define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %fneg.a, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
+; GCN: buffer_store_dword [[MUL]]
+define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fneg.a = fsub float -0.000000e+00, %a
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
+ %fneg = fsub float -0.000000e+00, %mul
+ %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
+ store volatile float %fneg, float addrspace(1)* %out
+ store volatile float %use1, float addrspace(1)* %out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; sin tests
+; --------------------------------------------------------------------------------
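+;
+; llvm.sin.f32 lowers here as v_sin(v_fract(x * 0x3e22f983)), i.e.
+; sin(fract(x / 2pi)), so the negate is expected to fold into that
+; multiply, while llvm.amdgcn.sin.f32 maps directly to v_sin_f32 and
+; takes the negate as a source modifier.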
+
+; GCN-LABEL: {{^}}v_fneg_sin_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[K]], -[[A]]
+; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
+; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %sin = call float @llvm.sin.f32(float %a)
+ %fneg = fsub float -0.000000e+00, %sin
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %sin = call float @llvm.amdgcn.sin.f32(float %a)
+ %fneg = fsub float -0.000000e+00, %sin
+ store float %fneg, float addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare float @llvm.sin.f32(float) #1
+
+declare float @llvm.amdgcn.sin.f32(float) #1
+declare float @llvm.amdgcn.rcp.f32(float) #1
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp.ll b/test/CodeGen/AMDGPU/fp16_to_fp.ll
deleted file mode 100644
index 5a79ca82bc29b..0000000000000
--- a/test/CodeGen/AMDGPU/fp16_to_fp.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
-declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
-
-; SI-LABEL: {{^}}test_convert_fp16_to_fp32:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]]
-; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[RESULT]]
-define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
- %val = load i16, i16 addrspace(1)* %in, align 2
- %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-
-; SI-LABEL: {{^}}test_convert_fp16_to_fp64:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]]
-; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
-; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
- %val = load i16, i16 addrspace(1)* %in, align 2
- %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
- store double %cvt, double addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
new file mode 100644
index 0000000000000..35e9541692dba
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+
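+; The i16 bit pattern is reinterpreted as an IEEE half and widened:
+; GCN uses v_cvt_f32_f16 on a buffer_load_ushort result, while the
+; r600 targets read the value with VTX_READ_16 and convert it with
+; FLT16_TO_FLT32.
+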
+; FUNC-LABEL: {{^}}test_convert_fp16_to_fp32:
+; GCN: buffer_load_ushort [[VAL:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[RESULT]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RES:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]]
+; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]]
+; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]]
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16, i16 addrspace(1)* %in, align 2
+ %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
new file mode 100644
index 0000000000000..8b05d7b88a105
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+
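+; There is no single f16->f64 conversion instruction, so the checks
+; expect the two-step v_cvt_f32_f16 followed by v_cvt_f64_f32.
+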
+; FUNC-LABEL: {{^}}test_convert_fp16_to_fp64:
+; GCN: buffer_load_ushort [[VAL:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
+; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16, i16 addrspace(1)* %in, align 2
+ %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+ store double %cvt, double addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 67925ebd82b65..346ad822f2935 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,12 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
-; SI-LABEL: {{^}}test_convert_fp32_to_fp16:
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
-; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: buffer_store_short [[RESULT]]
+; FUNC-LABEL: {{^}}test_convert_fp32_to_fp16:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_short [[RESULT]]
+
+; EG: MEM_RAT MSKOR
+; EG: VTX_READ_32
+; EG: FLT32_TO_FLT16
define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%val = load float, float addrspace(1)* %in, align 4
%cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 7351665f06e4d..2c538b16e743d 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -207,11 +207,15 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16>
; GCN: buffer_load_ushort v{{[0-9]+}}, off
; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}}
+; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
+; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}
+
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
-; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
; GCN: s_waitcnt
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
index d49fa2bf48a7d..2ef045dbb8eb2 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -7,11 +7,14 @@
;
; CHECK-LABEL: {{^}}main:
+; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
-; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
+; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; TODO: add 0?
-; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
+; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index 0e6281940c248..d141281f36b87 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -273,12 +273,12 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -306,15 +306,15 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
-; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
new file mode 100644
index 0000000000000..559d464f36a5b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+; --------------------------------------------------------------------------------
+; Don't fold if fneg can fold into the source
+; --------------------------------------------------------------------------------
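+;
+; The negate is expected to stay on the select result here: the
+; legacy op is emitted without a source modifier, the select is
+; against -2.0, and an explicit sign-bit xor negates the selected
+; value.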
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %rcp = call float @llvm.amdgcn.rcp.legacy(float %x)
+ %fneg = fsub float -0.0, %rcp
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0)
+ %fneg = fsub float -0.0, %mul
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
new file mode 100644
index 0000000000000..d9d311cd032b1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -0,0 +1,840 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
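+
+; These tests check when fabs/fneg modifiers can be hoisted out of a
+; select: if both inputs carry the same modifier, or the other input
+; is a constant the modifier can fold into, the modifier should apply
+; once to the select result rather than to each input.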
+
+; GCN-LABEL: {{^}}add_select_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
+define void @add_select_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add = fadd float %select, %z
+ store float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_lhs_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]]
+define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fabs.x, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_store_use_lhs_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_add_f32_e64 [[ADD:v[0-9]+]], |[[SELECT]]|, [[Z]]
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+
+; GCN: buffer_store_dword [[ADD]]
+; GCN: buffer_store_dword [[X_ABS]]
+define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add0 = fadd float %select, %z
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %fabs.x, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_rhs_fabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]]
+define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fabs.x, float %fabs.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fabs.y, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_var_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fabs_var_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float %fabs.x, float %y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+define void @add_select_fabs_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float %fabs, float -1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; FIXME: fabs should fold away
+; GCN-LABEL: {{^}}add_select_fabs_negk_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
+define void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2.0, float -1.0
+ %fabs = call float @llvm.fabs.f32(float %select)
+ %add = fadd float %fabs, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_posk_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+define void @add_select_posk_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 2.0, float 1.0
+ %add = fadd float %select, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negk_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+define void @add_select_negk_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float -1.0, float %fabs
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negliteralk_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000
+
+; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+define void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float -1024.0, float %fabs
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
+define void @add_select_fabs_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float %fabs, float 1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_posk_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
+define void @add_select_posk_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %select = select i1 %cmp, float 1.0, float %fabs
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
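+; The same select-hoisting pattern is tested below with fneg: the
+; negated select result shows up as a v_subrev_f32 of the select
+; output.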
+; GCN-LABEL: {{^}}add_select_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+define void @add_select_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_lhs_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
+define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fneg.x, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_store_use_lhs_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]]
+; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]]
+
+; GCN: buffer_store_dword [[ADD]]
+; GCN: buffer_store_dword [[NEG_X]]
+define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add0 = fadd float %select, %z
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %fneg.x, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_multi_use_rhs_fneg_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+; GCN: buffer_load_dword [[W:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
+define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %w = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %fneg.y = fsub float -0.0, %y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.y
+ %add0 = fadd float %select, %z
+ %add1 = fadd float %fneg.y, %w
+ store volatile float %add0, float addrspace(1)* undef
+ store volatile float %add1, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_var_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fneg_var_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float %y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float -1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float 0x3FC45F3060000000
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
+; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
+
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float 0xBFC45F3060000000
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negk_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cmp_eq_u32_e64
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+define void @add_select_negk_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2.0, float -1.0
+ %add = fadd float %select, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negliteralk_negliteralk_f32:
+; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0xc5000000
+; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc5800000
+; GCN-DAG: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cmp_eq_u32_e64
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2048.0, float -4096.0
+ %add = fadd float %select, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_negk_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
+define void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -2.0, float -1.0
+ %fneg.x = fsub float -0.0, %select
+ %add = fadd float %fneg.x, %x
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negk_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_negk_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float -1.0, float %fneg.x
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fneg_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_fneg_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float %fneg.x, float 1.0
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_posk_fneg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+define void @add_select_posk_fneg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.0, %x
+ %select = select i1 %cmp, float 1.0, float %fneg.x
+ %add = fadd float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negfabs_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]]
+; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_negfabs_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fneg.fabs.x, float %fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]]
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fabs_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y
+ %select = select i1 %cmp, float %fabs.x, float %fneg.fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_neg_fabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
+; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_neg_fabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.000000e+00, %x
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %select = select i1 %cmp, float %fneg.x, float %fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_fabs_neg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+define void @add_select_fabs_neg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.y = fsub float -0.000000e+00, %y
+ %select = select i1 %cmp, float %fabs.x, float %fneg.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_neg_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+define void @add_select_neg_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fneg.x = fsub float -0.000000e+00, %x
+ %fabs.y = call float @llvm.fabs.f32(float %y)
+ %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y
+ %select = select i1 %cmp, float %fneg.x, float %fneg.fabs.y
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_select_negfabs_neg_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
+; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+define void @add_select_negfabs_neg_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %fneg.y = fsub float -0.000000e+00, %y
+ %select = select i1 %cmp, float %fneg.y, float %fneg.fabs.x
+ %add = fadd float %select, %z
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_negfabs_posk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN-DAG: v_cmp_eq_u32_e64 vcc,
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+define void @mul_select_negfabs_posk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float %fneg.fabs.x, float 4.0
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_posk_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+define void @mul_select_posk_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float 4.0, float %fneg.fabs.x
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_negfabs_negk_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+define void @mul_select_negfabs_negk_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float %fneg.fabs.x, float -4.0
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}mul_select_negk_negfabs_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e64 vcc
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
+ %select = select i1 %cmp, float -4.0, float %fneg.fabs.x
+ %add = fmul float %select, %y
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; Don't fold if fneg can fold into the source
+; --------------------------------------------------------------------------------
+
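+; Roughly, for the cases below: given IR like
+;   %add = fadd float %x, 4.0
+;   %fneg = fsub float -0.0, %add
+;   %select = select i1 %cmp, float %fneg, float 2.0
+; the fneg folds into its source for free (v_sub_f32 -4.0, x), so
+; hoisting it out through the select would gain nothing.
+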
+; GCN-LABEL: {{^}}select_fneg_posk_src_add_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %add = fadd float %x, 4.0
+ %fneg = fsub float -0.0, %add
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_sub_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %add = fsub float %x, 4.0
+ %fneg = fsub float -0.0, %add
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_mul_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %mul = fmul float %x, 4.0
+ %fneg = fsub float -0.0, %mul
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_fma_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fma = call float @llvm.fma.f32(float %x, float 4.0, float %z)
+ %fneg = fsub float -0.0, %fma
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_fmad_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %z = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %fmad = call float @llvm.fmuladd.f32(float %x, float 4.0, float %z)
+ %fneg = fsub float -0.0, %fmad
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+; FIXME: This one should fold to rcp
+; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
+ %x = load volatile float, float addrspace(1)* undef
+ %y = load volatile float, float addrspace(1)* undef
+ %cmp = icmp eq i32 %c, 0
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %x)
+ %fneg = fsub float -0.0, %rcp
+ %select = select i1 %cmp, float %fneg, float 2.0
+ store volatile float %select, float addrspace(1)* undef
+ ret void
+}
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare float @llvm.amdgcn.rcp.f32(float) #1
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll
new file mode 100644
index 0000000000000..ad358d33c4052
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-opt.ll
@@ -0,0 +1,161 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure to test with both f32 and i32 compares. If we have to use
+; float compares, we always end up with multiple condition registers;
+; if we can use scalar compares, we want to avoid needing more than one.
+
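+; Roughly, the pattern expected for the i32 case is:
+;   v_cmp_ne_u32_e32 vcc, ...
+;   v_cmp_ne_u32_e64 s[0:1], ...
+;   s_and_b64 vcc, vcc, s[0:1]
+; (the s[0:1] pair is illustrative); only the second compare needs an
+; extra SGPR pair, and the combined result lands back in vcc for the
+; v_cndmask.
+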
+; GCN-LABEL: {{^}}opt_select_i32_and_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %and = and i1 %icmp0, %icmp1
+ %select = select i1 %and, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i32_and_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %and = and i1 %fcmp0, %fcmp1
+ %select = select i1 %and, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_and_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %and = and i1 %icmp0, %icmp1
+ %select = select i1 %and, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_and_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc,
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %and = and i1 %fcmp0, %fcmp1
+ %select = select i1 %and, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i32_or_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_endpgm
+define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %or = or i1 %icmp0, %icmp1
+ %select = select i1 %or, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i32_or_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %or = or i1 %fcmp0, %fcmp1
+ %select = select i1 %or, i32 %x, i32 %y
+ store i32 %select, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_or_cmp_i32:
+; GCN-DAG: v_cmp_ne_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+ %icmp0 = icmp ne i32 %a, %b
+ %icmp1 = icmp ne i32 %a, %c
+ %or = or i1 %icmp0, %icmp1
+ %select = select i1 %or, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}opt_select_i64_or_cmp_f32:
+; GCN-DAG: v_cmp_lg_f32_e32 vcc,
+; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
+define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+ %fcmp0 = fcmp one float %a, %b
+ %fcmp1 = fcmp one float %a, %c
+ %or = or i1 %fcmp0, %fcmp1
+ %select = select i1 %or, i64 %x, i64 %y
+ store i64 %select, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}regression:
+; GCN: v_cmp_neq_f32_e64 vcc
+; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
+
+define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
+entry:
+ %cmp0 = fcmp oeq float %c0, 1.0
+ br i1 %cmp0, label %if0, label %endif
+
+if0:
+ %cmp1 = fcmp oeq float %c1, 0.0
+ br i1 %cmp1, label %if1, label %endif
+
+if1:
+ %cmp2 = xor i1 %cmp1, true
+ br label %endif
+
+endif:
+ %tmp0 = phi i1 [ true, %entry ], [ %cmp2, %if1 ], [ false, %if0 ]
+ %tmp2 = select i1 %tmp0, float 4.0, float 0.0
+ store float %tmp2, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index f9216d311471f..84f9b7bb80648 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -2,6 +2,8 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; FIXME: The i16 promotion pass ruins the scalar cases when i16 is legal.
+
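+; Sketch of the issue: when i16 is legal (VI), the in-register sign
+; extension is emitted as an s_lshl_b32 / s_sext_i32_i16 / s_lshr_b32
+; sequence instead of the single s_bfe_i32 the SI path produces.
+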
; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
; GCN: s_load_dword [[ARG:s[0-9]+]],
; GCN: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
@@ -659,6 +661,137 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs
ret void
}
+; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+ %ld = load i32, i32 addrspace(2)* %ptr
+ %in = trunc i32 %ld to i16
+ %shl = shl i16 %in, 15
+ %sext = ashr i16 %shl, 15
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i2_i16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x20000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+ %ld = load i32, i32 addrspace(2)* %ptr
+ %in = trunc i32 %ld to i16
+ %shl = shl i16 %in, 14
+ %sext = ashr i16 %shl, 14
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_i16:
+; GCN: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]]
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}}
+
+; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
+define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid
+ %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid
+
+ %in = load i16, i16 addrspace(1)* %gep
+ %shl = shl i16 %in, 15
+ %sext = ashr i16 %shl, 15
+ store i16 %sext, i16 addrspace(3)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_i16_nonload:
+; GCN: {{buffer|flat}}_load_ushort [[VAL0:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[VAL1:v[0-9]+]]
+
+; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+; VI: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
+; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
+define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid
+ %a = load volatile i16, i16 addrspace(1)* %a.gep, align 2
+ %b = load volatile i16, i16 addrspace(1)* %b.gep, align 2
+
+ %c = shl i16 %a, %b
+ %shl = shl i16 %c, 15
+ %ashr = ashr i16 %shl, 15
+
+ store i16 %ashr, i16 addrspace(3)* %out.gep, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i2_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x20000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+ %shl = shl i16 %in, 14
+ %sext = ashr i16 %shl, 14
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i8_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_sext_i32_i8 [[SSEXT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
+; SI: buffer_store_short [[VSEXT]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+ %shl = shl i16 %in, 8
+ %sext = ashr i16 %shl, 8
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sext_in_reg_i15_i16_arg:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+
+; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0xf0000
+; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; SI: buffer_store_short [[VBFE]]
+
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+ %shl = shl i16 %in, 1
+ %sext = ashr i16 %shl, 1
+ store i16 %sext, i16 addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.r600.read.tidig.x() #1
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
new file mode 100644
index 0000000000000..1988a14b58453
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -0,0 +1,597 @@
+# RUN: llc -verify-machineinstrs -march=amdgcn -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+# Check that add with carry out isn't incorrectly reduced to e32 when
+# the carry out is a virtual register.
+
+# TODO: We should run this test until the end of codegen to make sure
+# that the post-RA run does manage to shrink it, but right now resuming
+# the pipeline from this point crashes.
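+
+# Background sketch: the VOP3 (e64) form can write its carry out to an
+# arbitrary SGPR pair, while the VOP2 (e32) form implicitly defines
+# %vcc. Shrinking to e32 is therefore only valid once the carry out is
+# known to be %vcc rather than a virtual register.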
+
+--- |
+ define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = sub i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = sub i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+ %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.ptr
+ %b = load volatile i32, i32 addrspace(1)* %b.ptr
+ %result = add i32 %a, %b
+ store volatile i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+ }
+
+ declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+ attributes #0 = { nounwind }
+ attributes #1 = { nounwind readnone }
+
+...
+---
+# GCN-LABEL: name: shrink_add_vop3{{$}}
+# GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+name: shrink_add_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: shrink_sub_vop3{{$}}
+# GCN: %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+
+name: shrink_sub_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: shrink_subrev_vop3{{$}}
+# GCN: %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+
+name: shrink_subrev_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: check_addc_src2_vop3{{$}}
+# GCN: %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+name: check_addc_src2_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %9 = S_MOV_B64 0
+ %29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: shrink_addc_vop3{{$}}
+# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+
+name: shrink_addc_vop3
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %vcc = S_MOV_B64 0
+ %29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
+
+---
+# GCN-LABEL: name: shrink_addc_undef_vcc{{$}}
+# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec
+# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+name: shrink_addc_undef_vcc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vreg_64 }
+ - { id: 21, class: sreg_32_xm0 }
+ - { id: 22, class: sreg_32 }
+ - { id: 23, class: sreg_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vreg_64 }
+ - { id: 28, class: vreg_64 }
+ - { id: 29, class: vgpr_32 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %27 = REG_SEQUENCE %3, 1, %26, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %28 = V_LSHL_B64 killed %27, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec
+ %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index 027c63817903e..16aed5928b0ac 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -212,5 +212,71 @@ entry:
ret void
}
+; Without special casing the inline constant check for v_mac_f32's
+; src2, this fails to fold the 1.0 into a mad.
+
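+; Roughly: v_mac_f32 d, a, b computes d = a * b + d, so src2 is tied to
+; the destination and cannot hold an inline constant like 1.0, while
+; v_mad_f32 d, a, b, c has an independent src2 where 1.0 is free as an
+; inline constant.
+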
+; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
+; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
+define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext
+ %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext
+ %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+ %tmp = load volatile float, float addrspace(1)* %gep.a
+ %tmp1 = load volatile float, float addrspace(1)* %gep.b
+ %tmp2 = fadd float %tmp, %tmp
+ %tmp3 = fmul float %tmp2, 4.0
+ %tmp4 = fsub float 1.0, %tmp3
+ %tmp5 = fadd float %tmp4, %tmp1
+ %tmp6 = fadd float %tmp1, %tmp1
+ %tmp7 = fmul float %tmp6, %tmp
+ %tmp8 = fsub float 1.0, %tmp7
+ %tmp9 = fmul float %tmp8, 8.0
+ %tmp10 = fadd float %tmp5, %tmp9
+ store float %tmp10, float addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]]
+
+; FIXME: How is this not folded?
+; SI: v_cvt_f32_f16_e32 v{{[0-9]+}}, 0x3c00
+
+; VI: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
+; VI: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
+define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
+ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+ %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+ %tmp = load volatile half, half addrspace(1)* %gep.a
+ %tmp1 = load volatile half, half addrspace(1)* %gep.b
+ %tmp2 = fadd half %tmp, %tmp
+ %tmp3 = fmul half %tmp2, 4.0
+ %tmp4 = fsub half 1.0, %tmp3
+ %tmp5 = fadd half %tmp4, %tmp1
+ %tmp6 = fadd half %tmp1, %tmp1
+ %tmp7 = fmul half %tmp6, %tmp
+ %tmp8 = fsub half 1.0, %tmp7
+ %tmp9 = fmul half %tmp8, 8.0
+ %tmp10 = fadd half %tmp5, %tmp9
+ store half %tmp10, half addrspace(1)* %gep.out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
attributes #0 = { nounwind "unsafe-fp-math"="false" }
attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll
index ebc5934df022b..8241236872873 100644
--- a/test/CodeGen/ARM/fp16-promote.ll
+++ b/test/CodeGen/ARM/fp16-promote.ll
@@ -825,7 +825,7 @@ define void @test_fmuladd(half* %p, half* %q, half* %r) #0 {
; CHECK-ALL: strh
; CHECK-ALL: mov
; CHECK-ALL-DAG: ldrh
-; CHECK-ALL-DAG: add
+; CHECK-ALL-DAG: orr
; CHECK-ALL: strh
; CHECK-ALL: ldrh
; CHECK-ALL: strh
@@ -855,7 +855,7 @@ define void @test_insertelement(half* %p, <4 x half>* %q, i32 %i) #0 {
; CHECK-VFP: orr
; CHECK-VFP: str
; CHECK-VFP: mov
-; CHECK-VFP: add
+; CHECK-VFP: orr
; CHECK-VFP: ldrh
; CHECK-VFP: strh
; CHECK-VFP: add sp, sp, #8
diff --git a/test/CodeGen/ARM/fpcmp_ueq.ll b/test/CodeGen/ARM/fpcmp_ueq.ll
index ba14140cdc447..c1696c9be1b7c 100644
--- a/test/CodeGen/ARM/fpcmp_ueq.ll
+++ b/test/CodeGen/ARM/fpcmp_ueq.ll
@@ -9,7 +9,11 @@ entry:
}
; CHECK-ARMv4-LABEL: f7:
-; CHECK-ARMv4: moveq r6, #1
+; CHECK-ARMv4-DAG: bl ___eqsf2
+; CHECK-ARMv4-DAG: bl ___unordsf2
+; CHECK-ARMv4: cmp r0, #0
+; CHECK-ARMv4: movne r0, #1
+; CHECK-ARMv4: orrs r0, r0,
; CHECK-ARMv4: moveq r0, #42
; CHECK-ARMv7-LABEL: f7:
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 25c4807d9862b..b7693c7976357 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -373,7 +373,8 @@ define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK: mov r[[FP:[0-9]+]], sp
; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4]
; CHECK: mov r[[SPCOPY:[0-9]+]], sp
-; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[IDX]]
+; CHECK: and r[[MASKED_IDX:[0-9]+]], r[[IDX]], #15
+; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[MASKED_IDX]]
; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]]
%x = extractelement <16 x i8> %v, i32 %idx
%1 = insertelement <8 x i8> undef, i8 %x, i32 0
diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll
index 269223ac9f386..1aa23597cf499 100644
--- a/test/CodeGen/ARM/vpadd.ll
+++ b/test/CodeGen/ARM/vpadd.ll
@@ -214,14 +214,11 @@ define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
}
; Combine vuzp+vadd->vpadd.
-; FIXME: Implement this optimization
-define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
-; CHECK-LABEL: addCombineToVPADD:
+define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i8:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vorr d18, d17, d17
-; CHECK-NEXT: vuzp.8 d16, d18
-; CHECK-NEXT: vadd.i8 d16, d18, d16
+; CHECK-NEXT: vpadd.i8 d16, d16, d17
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
%tmp = load <16 x i8>, <16 x i8>* %cbcr
@@ -233,15 +230,44 @@ define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
ret void
}
+; Combine vuzp+vadd->vpadd.
+define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i16:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpadd.i16 d16, d16, d17
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <8 x i16>, <8 x i16>* %cbcr
+ %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %add = add <4 x i16> %tmp3, %tmp1
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vtrn+vadd->vpadd.
+define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i32:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpadd.i32 d16, d16, d17
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <4 x i32>, <4 x i32>* %cbcr
+ %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+ %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+ %add = add <2 x i32> %tmp3, %tmp1
+ store <2 x i32> %add, <2 x i32>* %X, align 8
+ ret void
+}
+
; Combine vuzp+vaddl->vpaddl
-; FIXME: Implement this optimization.
-define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
-; CHECK-LABEL: addCombineToVPADDL_sext:
+define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s8:
; CHECK: @ BB#0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vorr d18, d17, d17
-; CHECK-NEXT: vuzp.8 d16, d18
-; CHECK-NEXT: vaddl.s8 q8, d18, d16
+; CHECK-NEXT: vpaddl.s8 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
%tmp = load <16 x i8>, <16 x i8>* %cbcr
@@ -254,10 +280,200 @@ define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind s
ret void
}
-; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
-; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
-define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
-; CHECK-LABEL: fromExtendingExtractVectorElt:
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Legalization butchers the shuffles.
+define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_s8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vmov.i16 d18, #0x8
+; CHECK-NEXT: vneg.s16 d18, d18
+; CHECK-NEXT: vext.8 d19, d16, d16, #1
+; CHECK-NEXT: vshl.i16 d16, d16, #8
+; CHECK-NEXT: vshl.i16 d17, d19, #8
+; CHECK-NEXT: vshl.s16 d16, d16, d18
+; CHECK-NEXT: vshl.s16 d17, d17, d18
+; CHECK-NEXT: vadd.i16 d16, d17, d16
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = sext <4 x i8> %tmp3 to <4 x i16>
+ %tmp5 = sext <4 x i8> %tmp1 to <4 x i16>
+ %add = add <4 x i16> %tmp4, %tmp5
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.u8 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ %tmp5 = zext <8 x i8> %tmp1 to <8 x i16>
+ %add = add <8 x i16> %tmp4, %tmp5
+ store <8 x i16> %add, <8 x i16>* %X, align 8
+ ret void
+}
+
+; In theory, it's possible to match this to vpaddl, but rearranging the
+; shuffle is awkward, so this doesn't match at the moment.
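+; A sketch of the form that does combine (hypothetical IR, not checked here):
+;   %even = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 0, i32 2, ...>
+;   %wide = zext <8 x i8> %even to <8 x i16>
+; i.e. shuffle-then-extend, as in addCombineToVPADDLq_u8 above. This test
+; extends first and shuffles the <16 x i16>, so the pairwise structure is
+; hidden behind the zext and no vpaddl is formed.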
+define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vmovl.u8 q9, d17
+; CHECK-NEXT: vmovl.u8 q8, d16
+; CHECK-NEXT: vuzp.16 q8, q9
+; CHECK-NEXT: vadd.i16 q8, q8, q9
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = zext <16 x i8> %tmp to <16 x i16>
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %add = add <8 x i16> %tmp2, %tmp3
+ store <8 x i16> %add, <8 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Legalization butchers the shuffle.
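+; (In the checks below, the vbic.i16 #0xff00 masks are the legalized
+; <4 x i8> zext: clear the high byte of each 16-bit lane.)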
+define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_u8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vext.8 d18, d16, d16, #1
+; CHECK-NEXT: vbic.i16 d16, #0xff00
+; CHECK-NEXT: vbic.i16 d18, #0xff00
+; CHECK-NEXT: vadd.i16 d16, d18, d16
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+ %tmp5 = zext <4 x i8> %tmp1 to <4 x i16>
+ %add = add <4 x i16> %tmp4, %tmp5
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Matching this to vpaddl.u8 would require matching the shuffle(zext()) form.
+define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vmovl.u8 q8, d16
+; CHECK-NEXT: vpadd.i16 d16, d16, d17
+; CHECK-NEXT: vstr d16, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <16 x i8>, <16 x i8>* %cbcr
+ %tmp1 = zext <16 x i8> %tmp to <16 x i16>
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %add = add <4 x i16> %tmp2, %tmp3
+ store <4 x i16> %add, <4 x i16>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s16:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.s16 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <8 x i16>, <8 x i16>* %cbcr
+ %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = sext <4 x i16> %tmp3 to <4 x i32>
+ %tmp5 = sext <4 x i16> %tmp1 to <4 x i32>
+ %add = add <4 x i32> %tmp4, %tmp5
+ store <4 x i32> %add, <4 x i32>* %X, align 8
+ ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u16:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.u16 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <8 x i16>, <8 x i16>* %cbcr
+ %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ %tmp5 = zext <4 x i16> %tmp1 to <4 x i32>
+ %add = add <4 x i32> %tmp4, %tmp5
+ store <4 x i32> %add, <4 x i32>* %X, align 8
+ ret void
+}
+
+; Combine vtrn+vaddl->vpaddl
+define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s32:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.s32 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <4 x i32>, <4 x i32>* %cbcr
+ %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+ %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+ %tmp4 = sext <2 x i32> %tmp3 to <2 x i64>
+ %tmp5 = sext <2 x i32> %tmp1 to <2 x i64>
+ %add = add <2 x i64> %tmp4, %tmp5
+ store <2 x i64> %add, <2 x i64>* %X, align 8
+ ret void
+}
+
+; Combine vtrn+vaddl->vpaddl
+define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u32:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vpaddl.u32 q8, q8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: mov pc, lr
+ %tmp = load <4 x i32>, <4 x i32>* %cbcr
+ %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+ %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ %tmp5 = zext <2 x i32> %tmp1 to <2 x i64>
+ %add = add <2 x i64> %tmp4, %tmp5
+ store <2 x i64> %add, <2 x i64>* %X, align 8
+ ret void
+}
+
+; Legalization promotes the <4 x i8> to <4 x i16>.
+define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_i8:
+; CHECK: @ BB#0:
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vpaddl.s8 d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %x = add <4 x i8> %tmp2, %tmp1
+ ret <4 x i8> %x
+}
+
+; Legalization promotes the <2 x i16> to <2 x i32>.
+define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_i16:
; CHECK: @ BB#0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vpaddl.s16 d16, d16
diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll
index 36731e933bab4..df6336043fdfb 100644
--- a/test/CodeGen/ARM/vtrn.ll
+++ b/test/CodeGen/ARM/vtrn.ll
@@ -70,14 +70,14 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vtrn.32 d17, d16
-; CHECK-NEXT: vadd.i32 d16, d17, d16
+; CHECK-NEXT: vmul.i32 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x i32>, <2 x i32>* %A
%tmp2 = load <2 x i32>, <2 x i32>* %B
%tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
%tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
- %tmp5 = add <2 x i32> %tmp3, %tmp4
+ %tmp5 = mul <2 x i32> %tmp3, %tmp4
ret <2 x i32> %tmp5
}
diff --git a/test/CodeGen/Mips/llvm-ir/extractelement.ll b/test/CodeGen/Mips/llvm-ir/extractelement.ll
index 1e1b02df99a27..3c7df4a5e99fc 100644
--- a/test/CodeGen/Mips/llvm-ir/extractelement.ll
+++ b/test/CodeGen/Mips/llvm-ir/extractelement.ll
@@ -14,6 +14,7 @@ define i1 @via_stack_bug(i8 signext %idx) {
; ALL-DAG: addiu [[ONE:\$[0-9]+]], $zero, 1
; ALL-DAG: sb [[ONE]], 7($sp)
; ALL-DAG: sb $zero, 6($sp)
+; ALL-DAG: andi [[MASKED_IDX:\$[0-9]+]], $4, 1
; ALL-DAG: addiu [[VPTR:\$[0-9]+]], $sp, 6
-; ALL-DAG: addu [[EPTR:\$[0-9]+]], $4, [[VPTR]]
+; ALL-DAG: or [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]]
; ALL: lbu $2, 0([[EPTR]])
diff --git a/test/CodeGen/Mips/msa/immediates-bad.ll b/test/CodeGen/Mips/msa/immediates-bad.ll
new file mode 100644
index 0000000000000..c6b8fcef649a4
--- /dev/null
+++ b/test/CodeGen/Mips/msa/immediates-bad.ll
@@ -0,0 +1,1681 @@
+; RUN: not llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s 2> %t1
+; RUN: FileCheck %s < %t1
+
+; Test that the immediate intrinsics with out-of-range values trigger an error.
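+; For reference (our reading of the MSA encodings, not asserted by this test):
+; addvi/maxi/mini/clei/clti take 5-bit immediates, andi/ori/nori/bmnzi/bmzi/
+; bseli take 8-bit immediates, ldi takes a signed 10-bit immediate, and the
+; bit-index forms (bclri/binsli/binsri/bnegi/bseti, the shifts, splati, sldi
+; and copy) are bounded by the element width, so e.g. 65 is rejected for a
+; byte-element shift.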
+
+
+define void @binsli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+; CHECK: LLVM ERROR: Immediate out of range
+
+define void @binsri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %a, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmnzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 9)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 152)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 129)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_b(<16 x i8> * %ptr) {
+entry:
+ %r = call <16 x i8> @llvm.mips.ldi.b(i32 1025)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 163)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @nori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 63)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 7)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @slli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @splati_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srai_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srari_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 65)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @addvi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_w(<4 x i32> * %ptr) {
+entry:
+ %r = call <4 x i32> @llvm.mips.ldi.w(i32 1024)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 63)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @slli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @splati_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srai_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srari_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 65)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @addvi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 16)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %a, i32 17)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %a, i32 19)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 19)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 19)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 130)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 130)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 63)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 130)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_h(<8 x i16> * %ptr) {
+entry:
+ %r = call <8 x i16> @llvm.mips.ldi.h(i32 1024)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @slli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @splati_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srai_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srari_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 65)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define i32 @copy_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 17)
+ ret i32 %r
+}
+
+
+define i32 @copy_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 9)
+ ret i32 %r
+}
+
+
+define i32 @copy_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 5)
+ ret i32 %r
+}
+
+
+define i32 @copy_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 16)
+ ret i32 %r
+}
+
+
+define i32 @copy_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 9)
+ ret i32 %r
+}
+
+
+define i32 @copy_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 5)
+ ret i32 %r
+}
+
+define i64 @copy_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 3)
+ ret i64 %r
+}
+
+define i64 @copy_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 3)
+ ret i64 %r
+}
+
+define void @addvi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 64)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_d(<2 x i64> * %ptr) {
+entry:
+ %r = call <2 x i64> @llvm.mips.ldi.d(i32 1024)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 63)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 1)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @slli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srai_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srari_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 65)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+; Negative numbers
+
+define void @neg_addvi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_andi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.andi.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %a, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bmnzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bmzi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 -5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 -120)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 -35)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_b(<16 x i8> * %ptr) {
+entry:
+ %r = call <16 x i8> @llvm.mips.ldi.b(i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_nori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ori_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 -25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 -7)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_splati_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 -3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @neg_addvi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 -140)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 -150)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 -25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 -200)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_w(<4 x i32> * %ptr) {
+entry:
+ %r = call <4 x i32> @llvm.mips.ldi.w(i32 -300)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 -20)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_splati_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 -3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @neg_addvi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 -8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %a, i32 -8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %a, i32 -15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 -14)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 -15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 -150)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 -25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 -200)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 -200)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 -200)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 -2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_h(<8 x i16> * %ptr) {
+entry:
+ %r = call <8 x i16> @llvm.mips.ldi.h(i32 -300)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_splati_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 -3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define i32 @neg_copy_s_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 -1)
+ ret i32 %r
+}
+
+define i32 @neg_copy_s_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 -1)
+ ret i32 %r
+}
+
+define i32 @neg_copy_s_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 -1)
+ ret i32 %r
+}
+
+define i32 @neg_copy_u_b(<16 x i8> * %ptr) {
+entry:
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 -1)
+ ret i32 %r
+}
+
+
+define i32 @neg_copy_u_h(<8 x i16> * %ptr) {
+entry:
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 -1)
+ ret i32 %r
+}
+
+
+define i32 @neg_copy_u_w(<4 x i32> * %ptr) {
+entry:
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 -1)
+ ret i32 %r
+}
+
+define i64 @neg_copy_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 -1)
+ ret i64 %r
+}
+
+define i64 @neg_copy_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 -1)
+ ret i64 %r
+}
+
+define void @neg_addvi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bclri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_binsri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bnegi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_bseti_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 -45)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clei_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 -32)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_clti_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 -25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_ldi_d(<2 x i64> * %ptr) {
+entry:
+ %r = call <2 x i64> @llvm.mips.ldi.d(i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 -202)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_maxi_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 -2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_s_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 -202)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_mini_u_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 -2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_sldi_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 -1)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_slli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srai_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srari_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srli_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @neg_srlri_d(<2 x i64> * %ptr) {
+entry:
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 -3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ldi.h(i32)
+declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32)
+declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.ldi.w(i32)
+declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32)
+declare <2 x i64> @llvm.mips.ldi.d(i32)
+declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32)
+declare <16 x i8> @llvm.mips.ldi.b(i32)
+declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32)
+declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32)
+declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32)
+declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32)
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32)
diff --git a/test/CodeGen/Mips/msa/immediates.ll b/test/CodeGen/Mips/msa/immediates.ll
new file mode 100644
index 0000000000000..b561ace30a8a1
--- /dev/null
+++ b/test/CodeGen/Mips/msa/immediates.ll
@@ -0,0 +1,1276 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,MSA32
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic -target-abi n32 < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,MSA64,MSA64N32
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic -target-abi n64 < %s \
+; RUN: | FileCheck %s -check-prefixes=CHECK,MSA64,MSA64N64
+
+; Test that the immediate intrinsics don't crash LLVM.
+
+; Some of the intrinsics lower to equivalent forms.
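+; For example, bclri_b below is checked against the equivalent andi.b, and
+; bmzi_b against the equivalent bmnzi.b.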
+
+define void @addvi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_b:
+; CHECK: addvi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @andi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: andi_b:
+; CHECK: andi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.andi.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_b:
+; CHECK: andi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_b:
+; CHECK: binsli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %a, <16 x i8> %b, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_b:
+; CHECK: binsri.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %a, <16 x i8> %b, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmnzi_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: bmnzi_b:
+; CHECK: bmnzi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %a, <16 x i8> %b, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bmzi_b(<16 x i8> * %ptr, <16 x i8> * %ptr2) {
+entry:
+; CHECK-LABEL: bmzi_b:
+; CHECK: bmnzi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %b = load <16 x i8>, <16 x i8> * %ptr2, align 16
+ %r = call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %a, <16 x i8> %b, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_b:
+; CHECK: bnegi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %a, i32 6)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bseli_b:
+; CHECK: bseli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %a, <16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_b:
+; CHECK: bseti.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %a, i32 5)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_b:
+; CHECK: clei_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %a, i32 12)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_b:
+; CHECK: clei_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_b:
+; CHECK: clti_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %a, i32 15)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_b:
+; CHECK: clti_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_b:
+; CHECK: ldi.b
+ %r = call <16 x i8> @llvm.mips.ldi.b(i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_b:
+; CHECK: maxi_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_b:
+; CHECK: maxi_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_b:
+; CHECK: mini_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_b:
+; CHECK: mini_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %a, i32 2)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @nori_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: nori_b:
+; CHECK: nori.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.nori.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ori_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: ori_b:
+; CHECK: ori.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.ori.b(<16 x i8> %a, i32 25)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_b:
+; CHECK: sldi.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %a, <16 x i8> %a, i32 7)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @slli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: slli_b:
+; CHECK: slli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.slli.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @splati_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: splati_b:
+; CHECK: splati.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.splati.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srai_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srai_b:
+; CHECK: srai.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srai.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srari_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srari_b:
+; CHECK: srari.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srari.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srli_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srli_b:
+; CHECK: srli.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srli.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_b:
+; CHECK: srlri.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %a, i32 3)
+ store <16 x i8> %r, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @ld_b(<16 x i8> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_b
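+; Under the 64-bit ABIs the i32 %offset arrives in a 64-bit register and is
+; sign-extended with "sll ..., 0" before being added to the base pointer; the
+; later ld/st tests check the same pattern.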
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+
+; MSA64N32-DAG: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32-DAG: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+
+; CHECK: ld.b $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <16 x i8> @llvm.mips.ld.b(i8* %ldptr, i32 %offset)
+ store <16 x i8> %a, <16 x i8> * %ptr, align 16
+ ret void
+}
+
+define void @st_b(<16 x i8> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_b
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32: sll $[[R1:[0-9]]], $6, 0
+; MSA64N32: sll $[[R2:[0-9]]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.b $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <16 x i8> @llvm.mips.ld.b(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.b(<16 x i8> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+define void @addvi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_w:
+; CHECK: addvi.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_w:
+; CHECK: bclri.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_w(<4 x i32> * %ptr, <4 x i32> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_w:
+; CHECK: binsli.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %b = load <4 x i32>, <4 x i32> * %ptr2, align 16
+ %r = call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %a, <4 x i32> %b, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_w(<4 x i32> * %ptr, <4 x i32> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_w:
+; CHECK: binsri.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %b = load <4 x i32>, <4 x i32> * %ptr2, align 16
+ %r = call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %a, <4 x i32> %b, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_w:
+; CHECK: bnegi.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_w:
+; CHECK: bseti.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_w:
+; CHECK: clei_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %a, i32 14)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_w:
+; CHECK: clei_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_w:
+; CHECK: clti_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %a, i32 15)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_w:
+; CHECK: clti_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %a, i32 25)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_w:
+; CHECK: maxi_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_w:
+; CHECK: maxi_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_w:
+; CHECK: mini_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_w:
+; CHECK: mini_u.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_w:
+; CHECK: ldi.w
+ %r = call <4 x i32> @llvm.mips.ldi.w(i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_w:
+; CHECK: sldi.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %a, <4 x i32> %a, i32 2)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @slli_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: slli_w:
+; CHECK: slli.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.slli.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @splati_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: splati_w:
+; CHECK: splati.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.splati.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srai_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srai_w:
+; CHECK: srai.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srai.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srari_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srari_w:
+; CHECK: srari.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srari.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srli_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srli_w:
+; CHECK: srli.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srli.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_w:
+; CHECK: srlri.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %a, i32 3)
+ store <4 x i32> %r, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @ld_w(<4 x i32> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_w
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+; MSA64N32: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+; CHECK: ld.w $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <4 x i32> @llvm.mips.ld.w(i8* %ldptr, i32 %offset)
+ store <4 x i32> %a, <4 x i32> * %ptr, align 16
+ ret void
+}
+
+define void @st_w(<4 x i32> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_w
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32: sll $[[R1:[0-9]+]], $6, 0
+; MSA64N32: sll $[[R2:[0-9]+]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.w $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <4 x i32> @llvm.mips.ld.w(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.w(<4 x i32> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+define void @addvi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_h:
+; CHECK: addvi.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %a, i32 25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bclri_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_h:
+; CHECK: bclri.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %a, i32 8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_h(<8 x i16> * %ptr, <8 x i16> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_h:
+; CHECK: binsli.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %b = load <8 x i16>, <8 x i16> * %ptr2, align 16
+ %r = call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %a, <8 x i16> %b, i32 8)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_h(<8 x i16> * %ptr, <8 x i16> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_h:
+; CHECK: binsri.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %b = load <8 x i16>, <8 x i16> * %ptr2, align 16
+ %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %b, i32 15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_h:
+; CHECK: bnegi.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %a, i32 14)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_h:
+; CHECK: bseti.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %a, i32 15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_h:
+; CHECK: clei_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %a, i32 13)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_h:
+; CHECK: clei_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %a, i32 25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_h:
+; CHECK: clti_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %a, i32 15)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_h:
+; CHECK: clti_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %a, i32 25)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_h:
+; CHECK: maxi_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_h:
+; CHECK: maxi_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_h:
+; CHECK: mini_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_h:
+; CHECK: mini_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %a, i32 2)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_h:
+; CHECK: ldi.h
+ %r = call <8 x i16> @llvm.mips.ldi.h(i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_h:
+; CHECK: sldi.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %a, <8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @slli_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: slli_h:
+; CHECK: slli.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.slli.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @splati_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: splati_h:
+; CHECK: splati.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.splati.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srai_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srai_h:
+; CHECK: srai.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srai.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srari_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srari_h:
+; CHECK: srari.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srari.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srli_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srli_h:
+; CHECK: srli.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srli.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_h:
+; CHECK: srlri.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %a, i32 3)
+ store <8 x i16> %r, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @ld_h(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_h
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+
+; MSA64N32-DAG: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32-DAG: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+
+; CHECK: ld.h $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <8 x i16> @llvm.mips.ld.h(i8* %ldptr, i32 %offset)
+ store <8 x i16> %a, <8 x i16> * %ptr, align 16
+ ret void
+}
+
+define void @st_h(<8 x i16> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_h
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32-DAG: sll $[[R1:[0-9]+]], $6, 0
+; MSA64N32-DAG: sll $[[R2:[0-9]+]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.h $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <8 x i16> @llvm.mips.ld.h(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.h(<8 x i16> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+define i32 @copy_s_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_b:
+; CHECK: copy_s.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.b(<16 x i8> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_s_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_h:
+; CHECK: copy_s.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.h(<8 x i16> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_s_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_w:
+; CHECK: copy_s.w
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.s.w(<4 x i32> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_u_b(<16 x i8> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_b:
+; CHECK: copy_u.b
+ %a = load <16 x i8>, <16 x i8> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.b(<16 x i8> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_u_h(<8 x i16> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_h:
+; CHECK: copy_u.h
+ %a = load <8 x i16>, <8 x i16> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.h(<8 x i16> %a, i32 1)
+ ret i32 %r
+}
+define i32 @copy_u_w(<4 x i32> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_w:
+; MSA32: copy_s.w
+; MSA64: copy_u.w
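+; On MSA32 the word element already fills the 32-bit destination register, so
+; zero- and sign-extension coincide and copy_s.w is used; copy_u.w requires a
+; 64-bit target.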
+ %a = load <4 x i32>, <4 x i32> * %ptr, align 16
+ %r = call i32 @llvm.mips.copy.u.w(<4 x i32> %a, i32 1)
+ ret i32 %r
+}
+
+define i64 @copy_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: copy_s_d:
+; MSA32: copy_s.w
+; MSA32: copy_s.w
+; MSA64: copy_s.d
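+; On MSA32 the i64 return value occupies a 32-bit register pair, so the
+; element is extracted with two word copies.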
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.s.d(<2 x i64> %a, i32 1)
+ ret i64 %r
+}
+
+define i64 @copy_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: copy_u_d:
+; MSA32: copy_s.w
+; MSA32: copy_s.w
+; MSA64: copy_s.d
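+; Sign- and zero-extending a doubleword into a 64-bit register are identical,
+; so the unsigned intrinsic also selects copy_s.d on MSA64.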
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call i64 @llvm.mips.copy.u.d(<2 x i64> %a, i32 1)
+ ret i64 %r
+}
+
+define void @addvi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: addvi_d:
+; CHECK: addvi.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
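+; bclri_d, binsli_d, bnegi_d and bseti_d below are among the intrinsics that
+; lower to equivalent forms: their checks match the generic and.v, bsel.v,
+; xor.v and or.v rather than the named .d instructions.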
+define void @bclri_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: bclri_d:
+; CHECK: and.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 16)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsli_d(<2 x i64> * %ptr, <2 x i64> * %ptr2) {
+entry:
+; CHECK-LABEL: binsli_d:
+; CHECK: bsel.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %b = load <2 x i64>, <2 x i64> * %ptr2, align 16
+ %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %b, i32 4)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @binsri_d(<2 x i64> * %ptr, <2 x i64> * %ptr2) {
+entry:
+; CHECK-LABEL: binsri_d:
+; CHECK: binsri.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %b = load <2 x i64>, <2 x i64> * %ptr2, align 16
+ %r = call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %a, <2 x i64> %b, i32 5)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bnegi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: bnegi_d:
+; CHECK: xor.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 9)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @bseti_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: bseti_d:
+; CHECK: or.v
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clei_s_d:
+; CHECK: clei_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %a, i32 15)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clei_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clei_u_d:
+; CHECK: clei_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clti_s_d:
+; CHECK: clti_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %a, i32 15)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @clti_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: clti_u_d:
+; CHECK: clti_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %a, i32 25)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ldi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: ldi_d:
+; CHECK: ldi.d
+ %r = call <2 x i64> @llvm.mips.ldi.d(i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_s_d:
+; CHECK: maxi_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @maxi_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: maxi_u_d:
+; CHECK: maxi_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_s_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: mini_s_d:
+; CHECK: mini_s.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @mini_u_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: mini_u_d:
+; CHECK: mini_u.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %a, i32 2)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @sldi_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: sldi_d:
+; CHECK: sldi.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %a, <2 x i64> %a, i32 1)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @slli_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: slli_d:
+; CHECK: slli.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.slli.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srai_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srai_d:
+; CHECK: srai.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srai.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srari_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srari_d:
+; CHECK: srari.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srari.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srli_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srli_d:
+; CHECK: srli.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srli.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @srlri_d(<2 x i64> * %ptr) {
+entry:
+; CHECK-LABEL: srlri_d:
+; CHECK: srlri.d
+ %a = load <2 x i64>, <2 x i64> * %ptr, align 16
+ %r = call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %a, i32 3)
+ store <2 x i64> %r, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ld_d(<2 x i64> * %ptr, i8 * %ldptr, i32 %offset) {
+entry:
+; CHECK-LABEL: ld_d
+; MSA32: addu $[[R0:[0-9]]], $5, $6
+; MSA64N32: sll $[[R2:[0-9]]], $6, 0
+; MSA64N32: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addu $[[R0:[0-9]]], $[[R1]], $[[R2]]
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $5, $[[R1]]
+; CHECK: ld.d $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 %offset)
+ store <2 x i64> %a, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @ld_d2(<2 x i64> * %ptr, i8 * %ldptr) {
+entry:
+; CHECK-LABEL: ld_d2
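+; A 4096-byte offset does not fit the scaled 10-bit signed offset field of
+; ld.d, so it is folded into the base address and the load uses offset zero.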
+; MSA32: addiu $[[R0:[0-9]]], $5, 4096
+; MSA64N32: sll $[[R1:[0-9]]], $5, 0
+; MSA64N32: addiu $[[R0:[0-9]]], $[[R1]], 4096
+; MSA64N64: daddiu $[[R0:[0-9]]], $5, 4096
+; CHECK: ld.d $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 4096)
+ store <2 x i64> %a, <2 x i64> * %ptr, align 16
+ ret void
+}
+
+define void @st_d(<2 x i64> * %ptr, i8 * %ldptr, i32 %offset, i8 * %stptr) {
+entry:
+; CHECK-LABEL: st_d
+; MSA32: addu $[[R0:[0-9]]], $7, $6
+
+; MSA64N32-DAG: sll $[[R1:[0-9]]], $6, 0
+; MSA64N32-DAG: sll $[[R2:[0-9]+]], $7, 0
+; MSA64N32: addu $[[R0:[0-9]+]], $[[R2]], $[[R1]]
+
+; MSA64N64: sll $[[R1:[0-9]]], $6, 0
+; MSA64N64: daddu $[[R0:[0-9]]], $7, $[[R1]]
+; CHECK: st.d $w{{[0-9]+}}, 0($[[R0]])
+ %a = call <2 x i64> @llvm.mips.ld.d(i8* %ldptr, i32 0)
+ call void @llvm.mips.st.d(<2 x i64> %a, i8* %stptr, i32 %offset)
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ldi.h(i32)
+declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32)
+declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32)
+declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.ldi.w(i32)
+declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, <4 x i32>, i32)
+declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32)
+declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32)
+declare <2 x i64> @llvm.mips.ldi.d(i32)
+declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, <2 x i64>, i32)
+declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32)
+declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32)
+declare <16 x i8> @llvm.mips.ldi.b(i32)
+declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srli.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32)
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32)
+declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32)
+declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32)
+declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32)
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32)
+declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32)
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32)
+declare <16 x i8> @llvm.mips.ld.b(i8*, i32)
+declare <8 x i16> @llvm.mips.ld.h(i8*, i32)
+declare <4 x i32> @llvm.mips.ld.w(i8*, i32)
+declare <2 x i64> @llvm.mips.ld.d(i8*, i32)
+declare void @llvm.mips.st.b(<16 x i8>, i8*, i32)
+declare void @llvm.mips.st.h(<8 x i16>, i8*, i32)
+declare void @llvm.mips.st.w(<4 x i32>, i8*, i32)
+declare void @llvm.mips.st.d(<2 x i64>, i8*, i32)
diff --git a/test/CodeGen/Mips/msa/msa-nooddspreg.ll b/test/CodeGen/Mips/msa/msa-nooddspreg.ll
new file mode 100644
index 0000000000000..7cfc66650e6be
--- /dev/null
+++ b/test/CodeGen/Mips/msa/msa-nooddspreg.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=mips -mcpu=mips32r5 -mattr=+fp64,+msa,+nooddspreg < %s | FileCheck %s
+
+; Test that the register allocator honours +nooddspreg and does not pick an odd
+; single precision subregister of an MSA register.
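+; The lwc1 feeding each vector insert must therefore target an even-numbered
+; $f register, which is what the checks below verify.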
+
+@f1 = external global float
+
+@f2 = external global float
+
+@v3 = external global <4 x float>
+
+@d1 = external global double
+
+define void @test() {
+; CHECK-LABEL: test:
+entry:
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %0 = load float, float * @f1
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %2 = insertelement <4 x float> %1, float %0, i32 1
+ %3 = insertelement <4 x float> %2, float %0, i32 2
+ %4 = insertelement <4 x float> %3, float %0, i32 3
+
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %5 = load float, float * @f2
+ %6 = insertelement <4 x float> undef, float %5, i32 0
+ %7 = insertelement <4 x float> %6, float %5, i32 1
+ %8 = insertelement <4 x float> %7, float %5, i32 2
+ %9 = insertelement <4 x float> %8, float %5, i32 3
+
+ %10 = fadd <4 x float> %4, %9
+ store <4 x float> %10, <4 x float> * @v3
+ ret void
+}
+
+; Test that the register allocator honours +nooddspreg and does not pick an
+; odd single precision register for a load that feeds a conversion to double.
+
+define void @test2() {
+; CHECK-LABEL: test2:
+entry:
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %0 = load float, float * @f1
+ %1 = fpext float %0 to double
+; CHECK-NOT: lwc1 $f{{[13579]+}}
+; CHECK: lwc1 $f{{[02468]+}}
+ %2 = load float, float * @f2
+ %3 = fpext float %2 to double
+ %4 = fadd double %1, %3
+ store double %4, double * @d1
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/fast-math.ll b/test/CodeGen/NVPTX/fast-math.ll
index 9da26adc15114..d0a333d369ca4 100644
--- a/test/CodeGen/NVPTX/fast-math.ll
+++ b/test/CodeGen/NVPTX/fast-math.ll
@@ -1,10 +1,8 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
declare float @llvm.nvvm.sqrt.f(float)
-
-; CHECK: sqrt_div
+; CHECK-LABEL: sqrt_div
; CHECK: sqrt.rn.f32
; CHECK: div.rn.f32
define float @sqrt_div(float %a, float %b) {
@@ -13,7 +11,7 @@ define float @sqrt_div(float %a, float %b) {
ret float %t2
}
-; CHECK: sqrt_div_fast
+; CHECK-LABEL: sqrt_div_fast
; CHECK: sqrt.approx.f32
; CHECK: div.approx.f32
define float @sqrt_div_fast(float %a, float %b) #0 {
@@ -22,22 +20,19 @@ define float @sqrt_div_fast(float %a, float %b) #0 {
ret float %t2
}
-
-; CHECK: fadd
-; CHECK: add.f32
+; CHECK-LABEL: fadd
+; CHECK: add.rn.f32
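+; Without unsafe-fp-math the add is emitted with an explicit .rn rounding
+; modifier, which also prevents ptxas from contracting it into an fma.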
define float @fadd(float %a, float %b) {
%t1 = fadd float %a, %b
ret float %t1
}
-; CHECK: fadd_ftz
-; CHECK: add.ftz.f32
+; CHECK-LABEL: fadd_ftz
+; CHECK: add.rn.ftz.f32
define float @fadd_ftz(float %a, float %b) #1 {
%t1 = fadd float %a, %b
ret float %t1
}
-
-
attributes #0 = { "unsafe-fp-math" = "true" }
attributes #1 = { "nvptx-f32ftz" = "true" }
diff --git a/test/CodeGen/PowerPC/change-no-infs.ll b/test/CodeGen/PowerPC/change-no-infs.ll
new file mode 100644
index 0000000000000..0cd5eb5408e3e
--- /dev/null
+++ b/test/CodeGen/PowerPC/change-no-infs.ll
@@ -0,0 +1,67 @@
+; Check that we can enable/disable NoInfsFPMath and NoNaNsInFPMath via function
+; attributes. An attribute on one function should not magically apply to the
+; next one.
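+; unsafe_math_off pins both attributes to "false" and unsafe_math_on pins
+; both to "true"; the interleaved default functions verify that neither
+; setting leaks into its neighbours.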
+
+; RUN: llc < %s -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 -mattr=-vsx \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
+
+; RUN: llc < %s -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 -mattr=-vsx \
+; RUN: -enable-no-infs-fp-math -enable-no-nans-fp-math \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
+
+; The fcmp+select in these functions should be converted to a fsel instruction
+; when both NoInfsFPMath and NoNaNsInFPMath are enabled.
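+; (fsel compares its first operand against zero without full IEEE ordered
+; semantics, so the combine is only sound when NaNs and infinities can be
+; ignored.)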
+
+; CHECK-LABEL: default0:
+define double @default0(double %a, double %y, double %z) {
+entry:
+; SAFE-NOT: fsel
+; UNSAFE: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: unsafe_math_off:
+define double @unsafe_math_off(double %a, double %y, double %z) #0 #2 {
+entry:
+; SAFE-NOT: fsel
+; UNSAFE-NOT: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: default1:
+define double @default1(double %a, double %y, double %z) {
+; SAFE-NOT: fsel
+; UNSAFE: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: unsafe_math_on:
+define double @unsafe_math_on(double %a, double %y, double %z) #1 #3 {
+entry:
+; SAFE: fsel
+; UNSAFE: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+; CHECK-LABEL: default2:
+define double @default2(double %a, double %y, double %z) {
+; SAFE-NOT: fsel
+; UNSAFE: fsel
+ %cmp = fcmp ult double %a, 0.000000e+00
+ %z.y = select i1 %cmp, double %z, double %y
+ ret double %z.y
+}
+
+attributes #0 = { "no-infs-fp-math"="false" }
+attributes #1 = { "no-infs-fp-math"="true" }
+
+attributes #2 = { "no-nans-fp-math"="false" }
+attributes #3 = { "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
index b61acab7f7cb6..98862cd049a5d 100644
--- a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
+++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -23,7 +23,7 @@ entry:
; CHECK: mfvsrd [[TOGPR:[0-9]+]],
; CHECK: srd [[RSHREG:[0-9]+]], [[TOGPR]], [[SHAMREG]]
; CHECK: extsw 3, [[RSHREG]]
-; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2
+; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
; CHECK-P7: lwax 3, [[ELEMOFFREG]],
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2
@@ -52,7 +52,7 @@ entry:
; CHECK-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SHIFTREG]]
; CHECK-DAG: vperm [[PERMVEC:[0-9]+]], 2, 2, [[SHMSKREG]]
; CHECK: mfvsrd 3,
-; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 3
+; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28
; CHECK-P7-DAG: stxvd2x 34,
; CHECK-P7: ldx 3, [[ELEMOFFREG]],
; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 1
@@ -75,7 +75,7 @@ entry:
; CHECK: lvsl [[SHMSKREG:[0-9]+]], 0, [[TRUNCREG]]
; CHECK: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]]
; CHECK: xscvspdpn 1,
-; CHECK-P7-DAG: sldi [[ELEMOFFREG:[0-9]+]], 5, 2
+; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
; CHECK-P7-DAG: stxvw4x 34,
; CHECK-P7: lfsx 1, [[ELEMOFFREG]],
; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2
diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll
index 49980da6eb8fe..e4f8f3fb6ca99 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts.ll
@@ -7,11 +7,18 @@ target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: test:
; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}}
+; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}}
; CHECK-NEXT: call .Lbitcast.1@FUNCTION{{$}}
; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0
; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0
+; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L1]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 0
+; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L2]]{{$}}
+; CHECK-NEXT: call foo0@FUNCTION
; CHECK-NEXT: i32.call $drop=, .Lbitcast.3@FUNCTION{{$}}
; CHECK-NEXT: call foo2@FUNCTION{{$}}
+; CHECK-NEXT: call foo1@FUNCTION{{$}}
; CHECK-NEXT: call foo3@FUNCTION{{$}}
; CHECK-NEXT: .endfunc
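+; Repeated identical bitcasts share one wrapper: both mismatched calls of
+; has_i32_arg go through .Lbitcast, and all three mismatched calls of foo0 go
+; through .Lbitcast.2, while the round-tripped %r call becomes a direct call
+; to foo0.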
@@ -47,10 +54,19 @@ declare void @foo3()
define void @test() {
entry:
call void bitcast (void (i32)* @has_i32_arg to void ()*)()
+ call void bitcast (void (i32)* @has_i32_arg to void ()*)()
call void bitcast (i32 ()* @has_i32_ret to void ()*)()
call void bitcast (void ()* @foo0 to void (i32)*)(i32 0)
+ %p = bitcast void ()* @foo0 to void (i32)*
+ call void %p(i32 0)
+ %q = bitcast void ()* @foo0 to void (i32)*
+ call void %q(i32 0)
+ %r = bitcast void (i32)* %q to void ()*
+ call void %r()
%t = call i32 bitcast (void ()* @foo1 to i32 ()*)()
call void bitcast (void ()* @foo2 to void ()*)()
+ call void @foo1()
call void @foo3()
+
ret void
}
diff --git a/test/CodeGen/X86/atom-bypass-slow-division-64.ll b/test/CodeGen/X86/atom-bypass-slow-division-64.ll
deleted file mode 100644
index 5980b7907c9ff..0000000000000
--- a/test/CodeGen/X86/atom-bypass-slow-division-64.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; RUN: llc < %s -mcpu=atom -march=x86-64 | FileCheck %s
-
-target triple = "x86_64-unknown-linux-gnu"
-
-; Additional tests for 64-bit divide bypass
-
-define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient:
-; CHECK: movq %rdi, %rax
-; CHECK: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax
-; CHECK-NEXT: je
-; CHECK: idivq
-; CHECK: ret
-; CHECK: divw
-; CHECK: ret
- %result = sdiv i64 %a, %b
- ret i64 %result
-}
-
-define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_remainder:
-; CHECK: movq %rdi, %rax
-; CHECK: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax
-; CHECK-NEXT: je
-; CHECK: idivq
-; CHECK: ret
-; CHECK: divw
-; CHECK: ret
- %result = srem i64 %a, %b
- ret i64 %result
-}
-
-define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient_and_remainder:
-; CHECK: movq %rdi, %rax
-; CHECK: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax
-; CHECK-NEXT: je
-; CHECK: idivq
-; CHECK: divw
-; CHECK: addq
-; CHECK: ret
-; CHECK-NOT: idivq
-; CHECK-NOT: divw
- %resultdiv = sdiv i64 %a, %b
- %resultrem = srem i64 %a, %b
- %result = add i64 %resultdiv, %resultrem
- ret i64 %result
-}
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
deleted file mode 100644
index 79001e5de192e..0000000000000
--- a/test/CodeGen/X86/atom-bypass-slow-division.ll
+++ /dev/null
@@ -1,112 +0,0 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-
-define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient:
-; CHECK: orl %ecx, %edx
-; CHECK-NEXT: testl $-256, %edx
-; CHECK-NEXT: je
-; CHECK: idivl
-; CHECK: ret
-; CHECK: divb
-; CHECK: ret
- %result = sdiv i32 %a, %b
- ret i32 %result
-}
-
-define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_get_remainder:
-; CHECK: orl %ecx, %edx
-; CHECK-NEXT: testl $-256, %edx
-; CHECK-NEXT: je
-; CHECK: idivl
-; CHECK: ret
-; CHECK: divb
-; CHECK: ret
- %result = srem i32 %a, %b
- ret i32 %result
-}
-
-define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_get_quotient_and_remainder:
-; CHECK: orl %ecx, %edx
-; CHECK-NEXT: testl $-256, %edx
-; CHECK-NEXT: je
-; CHECK: idivl
-; CHECK: divb
-; CHECK: addl
-; CHECK: ret
-; CHECK-NOT: idivl
-; CHECK-NOT: divb
- %resultdiv = sdiv i32 %a, %b
- %resultrem = srem i32 %a, %b
- %result = add i32 %resultdiv, %resultrem
- ret i32 %result
-}
-
-define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: Test_use_div_and_idiv:
-; CHECK: idivl
-; CHECK: divb
-; CHECK: divl
-; CHECK: divb
-; CHECK: addl
-; CHECK: ret
- %resultidiv = sdiv i32 %a, %b
- %resultdiv = udiv i32 %a, %b
- %result = add i32 %resultidiv, %resultdiv
- ret i32 %result
-}
-
-define i32 @Test_use_div_imm_imm() nounwind {
-; CHECK-LABEL: Test_use_div_imm_imm:
-; CHECK: movl $64
- %resultdiv = sdiv i32 256, 4
- ret i32 %resultdiv
-}
-
-define i32 @Test_use_div_reg_imm(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_div_reg_imm:
-; CHECK-NOT: test
-; CHECK-NOT: idiv
-; CHECK-NOT: divb
- %resultdiv = sdiv i32 %a, 33
- ret i32 %resultdiv
-}
-
-define i32 @Test_use_rem_reg_imm(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_rem_reg_imm:
-; CHECK-NOT: test
-; CHECK-NOT: idiv
-; CHECK-NOT: divb
- %resultrem = srem i32 %a, 33
- ret i32 %resultrem
-}
-
-define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_divrem_reg_imm:
-; CHECK-NOT: test
-; CHECK-NOT: idiv
-; CHECK-NOT: divb
- %resultdiv = sdiv i32 %a, 33
- %resultrem = srem i32 %a, 33
- %result = add i32 %resultdiv, %resultrem
- ret i32 %result
-}
-
-define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_div_imm_reg:
-; CHECK: test
-; CHECK: idiv
-; CHECK: divb
- %resultdiv = sdiv i32 4, %a
- ret i32 %resultdiv
-}
-
-define i32 @Test_use_rem_imm_reg(i32 %a) nounwind {
-; CHECK-LABEL: Test_use_rem_imm_reg:
-; CHECK: test
-; CHECK: idiv
-; CHECK: divb
- %resultdiv = sdiv i32 4, %a
- ret i32 %resultdiv
-}
diff --git a/test/CodeGen/X86/atomic-eflags-reuse.ll b/test/CodeGen/X86/atomic-eflags-reuse.ll
index dc1814b55cd3f..9902325fd1481 100644
--- a/test/CodeGen/X86/atomic-eflags-reuse.ll
+++ b/test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -176,4 +176,84 @@ entry:
ret i8 %tmp2
}
+define i8 @test_add_1_cmov_cmov(i64* %p, i8* %q) #0 {
+; TODO: It's possible to use "lock inc" here, but both cmovs need to be updated.
+; CHECK-LABEL: test_add_1_cmov_cmov:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+entry:
+ %add = atomicrmw add i64* %p, i64 1 seq_cst
+ %cmp = icmp slt i64 %add, 0
+ %s1 = select i1 %cmp, i8 12, i8 34
+ store i8 %s1, i8* %q
+ %s2 = select i1 %cmp, i8 56, i8 78
+ ret i8 %s2
+}
+
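+; Comparing the fetched value against 1 after subtracting 1 is the same as
+; testing the new value for zero, so the ZF produced by "lock dec" is reused;
+; test_add_5_setcc_ne below reuses the flags of "lock add" the same way.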
+define i8 @test_sub_1_setcc_eq(i64* %p) #0 {
+; CHECK-LABEL: test_sub_1_setcc_eq:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp eq i64 %tmp0, 1
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_add_5_setcc_ne(i64* %p) #0 {
+; CHECK-LABEL: test_add_5_setcc_ne:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock addq $5, (%rdi)
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
+ %tmp1 = icmp ne i64 %tmp0, -5
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_add_5_setcc_ne_comparand_mismatch(i64* %p) #0 {
+; CHECK-LABEL: test_add_5_setcc_ne_comparand_mismatch:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $5, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
+ %tmp1 = icmp ne i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+declare void @g()
+define zeroext i1 @test_sub_1_setcc_jcc(i64* %p) local_unnamed_addr #0 {
+; TODO: It's possible to use "lock dec" here, but both uses of the cmp need to
+; be updated.
+; CHECK-LABEL: test_sub_1_setcc_jcc:
+; CHECK: # BB#0: # %entry
+; CHECK: movq $-1, %rax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: cmpq $1, %rax
+; CHECK-NEXT: sete %bl
+; CHECK-NEXT: jne
+entry:
+ %add = atomicrmw volatile add i64* %p, i64 -1 seq_cst
+ %cmp = icmp ne i64 %add, 1
+ %not = xor i1 %cmp, true
+ br i1 %cmp, label %else, label %then
+then:
+ tail call void @g()
+ br label %else
+else:
+ ret i1 %not
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index c8e806890d072..a7cd8cf239843 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -62,6 +62,17 @@ define <8 x float> @fptrunc00(<8 x double> %b) nounwind {
ret <8 x float> %a
}
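+; The two-source vcvtsd2ss/vcvtss2sd forms convert one scalar and take the
+; remaining elements from the other operand, so the extract/convert/insert
+; pattern needs no separate blend.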
+define <4 x float> @fptrunc01(<2 x double> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: fptrunc01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %cvt = fptrunc double %ext to float
+ %res = insertelement <4 x float> %a1, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
define <4 x double> @fpext00(<4 x float> %b) nounwind {
; CHECK-LABEL: fpext00:
; CHECK: # BB#0:
@@ -71,6 +82,17 @@ define <4 x double> @fpext00(<4 x float> %b) nounwind {
ret <4 x double> %a
}

+define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: fpext01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ %cvt = fpext float %ext to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcA:
; CHECK: # BB#0:
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index 789ca24139401..c729b988cfb8b 100755
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -39,3 +39,29 @@ define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
%B = trunc <16 x i16> %A to <16 x i8>
ret <16 x i8> %B
}
+
+define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
+; CHECK-LABEL: usat_trunc_wb_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x6 = trunc <16 x i16> %x5 to <16 x i8>
+ ret <16 x i8> %x6
+}
+
+define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) {
+; CHECK-LABEL: usat_trunc_dw_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %x3 = icmp ult <8 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x5 = select <8 x i1> %x3, <8 x i32> %i, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x6 = trunc <8 x i32> %x5 to <8 x i16>
+ ret <8 x i16> %x6
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 5e50a3aef2f29..87deeb9e16c03 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,6 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
+

define <16 x float> @sitof32(<16 x i32> %a) nounwind {
; ALL-LABEL: sitof32:
@@ -12,255 +18,304 @@ define <16 x float> @sitof32(<16 x i32> %a) nounwind {
}

define <8 x double> @sltof864(<8 x i64> %a) {
-; KNL-LABEL: sltof864:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof864:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sltof864:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: sltof864:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
+; DQ-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}

define <4 x double> @sltof464(<4 x i64> %a) {
-; KNL-LABEL: sltof464:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof464:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: sltof464:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sltof464:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sltof464:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %b
}

define <2 x float> @sltof2f32(<2 x i64> %a) {
-; KNL-LABEL: sltof2f32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof2f32:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
+; NODQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sltof2f32:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: sltof2f32:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; AVX512DQ-LABEL: sltof2f32:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}

define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
-; KNL-LABEL: sltof4f32_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: vmovdqu (%rdi), %ymm0
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof4f32_mem:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vmovdqu (%rdi), %ymm0
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: sltof4f32_mem:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sltof4f32_mem:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sltof4f32_mem:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
+; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
}

define <4 x i64> @f64tosl(<4 x double> %a) {
-; KNL-LABEL: f64tosl:
-; KNL: ## BB#0:
-; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; KNL-NEXT: vcvttsd2si %xmm1, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; KNL-NEXT: vcvttsd2si %xmm1, %rax
-; KNL-NEXT: vmovq %rax, %xmm1
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; KNL-NEXT: vcvttsd2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; KNL-NEXT: vcvttsd2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm0
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: f64tosl:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
+; NODQ-NEXT: vcvttsd2si %xmm1, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; NODQ-NEXT: vcvttsd2si %xmm1, %rax
+; NODQ-NEXT: vmovq %rax, %xmm1
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT: vcvttsd2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; NODQ-NEXT: vcvttsd2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm0
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: f64tosl:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: f64tosl:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
+; VLDQ-NEXT: retq
+;
+; AVX512DQ-LABEL: f64tosl:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i64>
ret <4 x i64> %b
}

define <4 x i64> @f32tosl(<4 x float> %a) {
-; KNL-LABEL: f32tosl:
-; KNL: ## BB#0:
-; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; KNL-NEXT: vcvttss2si %xmm1, %rax
-; KNL-NEXT: vmovq %rax, %xmm1
-; KNL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; KNL-NEXT: vcvttss2si %xmm2, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; KNL-NEXT: vcvttss2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm2
-; KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; KNL-NEXT: vcvttss2si %xmm0, %rax
-; KNL-NEXT: vmovq %rax, %xmm0
-; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: f32tosl:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; NODQ-NEXT: vcvttss2si %xmm1, %rax
+; NODQ-NEXT: vmovq %rax, %xmm1
+; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; NODQ-NEXT: vcvttss2si %xmm2, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT: vcvttss2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NODQ-NEXT: vcvttss2si %xmm0, %rax
+; NODQ-NEXT: vmovq %rax, %xmm0
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: f32tosl:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: f32tosl:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttps2qq %xmm0, %ymm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: f32tosl:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: retq
%b = fptosi <4 x float> %a to <4 x i64>
ret <4 x i64> %b
}

define <4 x float> @sltof432(<4 x i64> %a) {
-; KNL-LABEL: sltof432:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: sltof432:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: sltof432:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sltof432:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sltof432:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}

define <4 x float> @ultof432(<4 x i64> %a) {
-; KNL-LABEL: ultof432:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: ultof432:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: retq
;
-; SKX-LABEL: ultof432:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: ultof432:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; AVX512DQ-LABEL: ultof432:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}

define <8 x double> @ultof64(<8 x i64> %a) {
-; KNL-LABEL: ultof64:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: ultof64:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: ultof64:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: ultof64:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
+; DQ-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
@@ -284,33 +339,33 @@ define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
}

define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
-; KNL-LABEL: fptoui_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: fptoui_256:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptoui_256:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttps2udq %ymm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: fptoui_256:
+; VL: ## BB#0:
+; VL-NEXT: vcvttps2udq %ymm0, %ymm0
+; VL-NEXT: retq
%b = fptoui <8 x float> %a to <8 x i32>
ret <8 x i32> %b
}

define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
-; KNL-LABEL: fptoui_128:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: fptoui_128:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptoui_128:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttps2udq %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: fptoui_128:
+; VL: ## BB#0:
+; VL-NEXT: vcvttps2udq %xmm0, %xmm0
+; VL-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -325,17 +380,17 @@ define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
}

define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
-; KNL-LABEL: fptoui_256d:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: fptoui_256d:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptoui_256d:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: fptoui_256d:
+; VL: ## BB#0:
+; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; VL-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -349,34 +404,34 @@ define <8 x double> @sitof64(<8 x i32> %a) {
ret <8 x double> %b
}
define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: sitof64_mask:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT: retq
+; NODQ-LABEL: sitof64_mask:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sitof64_mask:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; SKX-NEXT: retq
+; DQ-LABEL: sitof64_mask:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
+; DQ-NEXT: retq
%1 = bitcast i8 %c to <8 x i1>
%2 = sitofp <8 x i32> %b to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: sitof64_maskz:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; NODQ-LABEL: sitof64_maskz:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sitof64_maskz:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; SKX-NEXT: retq
+; DQ-LABEL: sitof64_maskz:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; DQ-NEXT: retq
%1 = bitcast i8 %b to <8 x i1>
%2 = sitofp <8 x i32> %a to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -402,19 +457,19 @@ define <4 x i32> @fptosi03(<4 x double> %a) {
}

define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
-; KNL-LABEL: fptrunc00:
-; KNL: ## BB#0:
-; KNL-NEXT: vcvtpd2ps %zmm0, %ymm0
-; KNL-NEXT: vcvtpd2ps %zmm1, %ymm1
-; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: fptrunc00:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0
+; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: fptrunc00:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0
-; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1
-; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: fptrunc00:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0
+; DQ-NEXT: vcvtpd2ps %zmm1, %ymm1
+; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
+; DQ-NEXT: retq
%a = fptrunc <16 x double> %b to <16 x float>
ret <16 x float> %a
}
@@ -429,25 +484,36 @@ define <4 x float> @fptrunc01(<4 x double> %b) {
}

define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
-; KNL-LABEL: fptrunc02:
-; KNL: ## BB#0:
-; KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: fptrunc02:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
+; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1
+; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fptrunc02:
-; SKX: ## BB#0:
-; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
-; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
-; SKX-NEXT: retq
+; VL-LABEL: fptrunc02:
+; VL: ## BB#0:
+; VL-NEXT: vpslld $31, %xmm1, %xmm1
+; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VL-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}

+define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: fptrunc03:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
+; ALL-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %cvt = fptrunc double %ext to float
+ %res = insertelement <4 x float> %a1, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
define <8 x double> @fpext00(<8 x float> %b) nounwind {
; ALL-LABEL: fpext00:
; ALL: ## BB#0:
@@ -458,24 +524,35 @@ define <8 x double> @fpext00(<8 x float> %b) nounwind {
}

define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) {
-; KNL-LABEL: fpext01:
-; KNL: ## BB#0:
-; KNL-NEXT: vcvtps2pd %xmm0, %ymm0
-; KNL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vandpd %ymm0, %ymm1, %ymm0
-; KNL-NEXT: retq
+; NOVL-LABEL: fpext01:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0
+; NOVL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
+; NOVL-NEXT: vandpd %ymm0, %ymm1, %ymm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: fpext01:
-; SKX: ## BB#0:
-; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1
-; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
-; SKX-NEXT: retq
+; VL-LABEL: fpext01:
+; VL: ## BB#0:
+; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1
+; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
+; VL-NEXT: retq
%a = fpext <4 x float> %b to <4 x double>
%mask = fcmp ogt <4 x double>%a1, %b1
%c = select <4 x i1>%mask, <4 x double>%a, <4 x double>zeroinitializer
ret <4 x double> %c
}

+define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: fpext02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ %cvt = fpext float %ext to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
define double @funcA(i64* nocapture %e) {
; ALL-LABEL: funcA:
; ALL: ## BB#0: ## %entry
@@ -589,53 +666,53 @@ define i32 @float_to_int(float %x) {
}

define <16 x double> @uitof64(<16 x i32> %a) nounwind {
-; KNL-LABEL: uitof64:
-; KNL: ## BB#0:
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm2
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm1
-; KNL-NEXT: vmovaps %zmm2, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: uitof64:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2
+; NODQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1
+; NODQ-NEXT: vmovaps %zmm2, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: uitof64:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2
-; SKX-NEXT: vextracti32x8 $1, %zmm0, %ymm0
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1
-; SKX-NEXT: vmovaps %zmm2, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: uitof64:
+; DQ: ## BB#0:
+; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2
+; DQ-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1
+; DQ-NEXT: vmovaps %zmm2, %zmm0
+; DQ-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x double>
ret <16 x double> %b
}
define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: uitof64_mask:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT: retq
+; NODQ-LABEL: uitof64_mask:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: uitof64_mask:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; SKX-NEXT: retq
+; DQ-LABEL: uitof64_mask:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
+; DQ-NEXT: retq
%1 = bitcast i8 %c to <8 x i1>
%2 = uitofp <8 x i32> %b to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: uitof64_maskz:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; NODQ-LABEL: uitof64_maskz:
+; NODQ: ## BB#0:
+; NODQ-NEXT: kmovw %edi, %k1
+; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; NODQ-NEXT: retq
;
-; SKX-LABEL: uitof64_maskz:
-; SKX: ## BB#0:
-; SKX-NEXT: kmovb %edi, %k1
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; SKX-NEXT: retq
+; DQ-LABEL: uitof64_maskz:
+; DQ: ## BB#0:
+; DQ-NEXT: kmovb %edi, %k1
+; DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; DQ-NEXT: retq
%1 = bitcast i8 %b to <8 x i1>
%2 = uitofp <8 x i32> %a to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -643,17 +720,17 @@ define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
}

define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
-; KNL-LABEL: uitof64_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitof64_256:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitof64_256:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitof64_256:
+; VL: ## BB#0:
+; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
+; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x double>
ret <4 x double> %b
}
@@ -668,33 +745,33 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind {
}

define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
-; KNL-LABEL: uitof32_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitof32_256:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitof32_256:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitof32_256:
+; VL: ## BB#0:
+; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
+; VL-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x float>
ret <8 x float> %b
}

define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
-; KNL-LABEL: uitof32_128:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitof32_128:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitof32_128:
-; SKX: ## BB#0:
-; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitof32_128:
+; VL: ## BB#0:
+; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
@@ -736,21 +813,21 @@ define double @uitofp03(i32 %a) nounwind {
}

define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
-; KNL-LABEL: sitofp_16i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; KNL-NEXT: retq
+; NODQ-LABEL: sitofp_16i1_float:
+; NODQ: ## BB#0:
+; NODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
+; NODQ-NEXT: retq
;
-; SKX-LABEL: sitofp_16i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %zmm0
-; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0
-; SKX-NEXT: retq
+; DQ-LABEL: sitofp_16i1_float:
+; DQ: ## BB#0:
+; DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; DQ-NEXT: vpmovm2d %k0, %zmm0
+; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0
+; DQ-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = sitofp <16 x i1> %mask to <16 x float>
ret <16 x float> %1
@@ -799,157 +876,259 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
}

define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
-; KNL-LABEL: sitofp_16i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1
-; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm1, %ymm1
-; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; KNL-NEXT: retq
+; NOVLDQ-LABEL: sitofp_16i1_double:
+; NOVLDQ: ## BB#0:
+; NOVLDQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
+; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
+; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; NOVLDQ-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm1, %ymm1
+; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; NOVLDQ-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_16i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
+; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
+; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
+; VLDQ-NEXT: vpmovm2d %k1, %ymm0
+; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm1
+; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: sitofp_16i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
+; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
+; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k2} {z}
+; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; VLNODQ-NEXT: retq
;
-; SKX-LABEL: sitofp_16i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %zmm2, %zmm2, %zmm2
-; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0
-; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k1
-; SKX-NEXT: vpmovm2d %k1, %ymm0
-; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0
-; SKX-NEXT: vpmovm2d %k0, %ymm1
-; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1
-; SKX-NEXT: retq
+; AVX512DQ-LABEL: sitofp_16i1_double:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
+; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
+; AVX512DQ-NEXT: vpmovm2q %k1, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1
+; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1
+; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <16 x double> %a, zeroinitializer
%1 = sitofp <16 x i1> %cmpres to <16 x double>
ret <16 x double> %1
}

define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
-; KNL-LABEL: sitofp_8i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; KNL-NEXT: retq
+; NOVLDQ-LABEL: sitofp_8i1_double:
+; NOVLDQ: ## BB#0:
+; NOVLDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; NOVLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_8i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %ymm0
-; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: sitofp_8i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
+; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm0
+; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: sitofp_8i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; VLNODQ-NEXT: retq
+;
+; AVX512DQ-LABEL: sitofp_8i1_double:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <8 x double> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x double>
ret <8 x double> %1
}

define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
-; KNL-LABEL: sitofp_8i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0
-; KNL-NEXT: retq
+; NOVLDQ-LABEL: sitofp_8i1_float:
+; NOVLDQ: ## BB#0:
+; NOVLDQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; NOVLDQ-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_8i1_float:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %ymm0
+; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_8i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %ymm0
-; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_8i1_float:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; VLNODQ-NEXT: retq
+;
+; AVX512DQ-LABEL: sitofp_8i1_float:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <8 x float> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x float>
ret <8 x float> %1
}

define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
-; KNL-LABEL: sitofp_4i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_4i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_4i1_float:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_4i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_4i1_float:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <4 x float> %a, zeroinitializer
%1 = sitofp <4 x i1> %cmpres to <4 x float>
ret <4 x float> %1
}

define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
-; KNL-LABEL: sitofp_4i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_4i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_4i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_4i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_4i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <4 x double> %a, zeroinitializer
%1 = sitofp <4 x i1> %cmpres to <4 x double>
ret <4 x double> %1
}

define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
-; KNL-LABEL: sitofp_2i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
-; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_2i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
+; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: sitofp_2i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLDQ-LABEL: sitofp_2i1_float:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: sitofp_2i1_float:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <2 x float> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x float>
ret <2 x float> %1
}

define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
-; KNL-LABEL: sitofp_2i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: sitofp_2i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: sitofp_2i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
+; VLDQ-NEXT: vpmovm2q %k0, %xmm0
+; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: sitofp_2i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0
-; SKX-NEXT: vpmovm2q %k0, %xmm0
-; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: sitofp_2i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
+; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <2 x double> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x double>
ret <2 x double> %1
@@ -989,174 +1168,187 @@ define <16 x float> @uitofp_16i1_float(<16 x i32> %a) {
}

define <16 x double> @uitofp_16i1_double(<16 x i32> %a) {
-; KNL-LABEL: uitofp_16i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: movq {{.*}}(%rip), %rax
-; KNL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; KNL-NEXT: kshiftrw $8, %k1, %k1
-; KNL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm1, %ymm1
-; KNL-NEXT: vcvtudq2pd %ymm1, %zmm1
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_16i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NOVL-NEXT: movq {{.*}}(%rip), %rax
+; NOVL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: kshiftrw $8, %k1, %k1
+; NOVL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm1, %ymm1
+; NOVL-NEXT: vcvtudq2pd %ymm1, %zmm1
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_16i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; SKX-NEXT: movl {{.*}}(%rip), %eax
-; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0
-; SKX-NEXT: kshiftrw $8, %k1, %k1
-; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %ymm1, %zmm1
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_16i1_double:
+; VL: ## BB#0:
+; VL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; VL-NEXT: movl {{.*}}(%rip), %eax
+; VL-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; VL-NEXT: kshiftrw $8, %k1, %k1
+; VL-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %ymm1, %zmm1
+; VL-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = uitofp <16 x i1> %mask to <16 x double>
ret <16 x double> %1
}

define <8 x float> @uitofp_8i1_float(<8 x i32> %a) {
-; KNL-LABEL: uitofp_8i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_8i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_8i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_8i1_float:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
+; VL-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
%1 = uitofp <8 x i1> %mask to <8 x float>
ret <8 x float> %1
}

define <8 x double> @uitofp_8i1_double(<8 x i32> %a) {
-; KNL-LABEL: uitofp_8i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_8i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVL-NEXT: vpmovqd %zmm0, %ymm0
+; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_8i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_8i1_double:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; VL-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
%1 = uitofp <8 x i1> %mask to <8 x double>
ret <8 x double> %1
}

define <4 x float> @uitofp_4i1_float(<4 x i32> %a) {
-; KNL-LABEL: uitofp_4i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_4i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_4i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_4i1_float:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
%1 = uitofp <4 x i1> %mask to <4 x float>
ret <4 x float> %1
}

define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
-; KNL-LABEL: uitofp_4i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpsrld $31, %xmm0, %xmm0
-; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_4i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpsrld $31, %xmm0, %xmm0
+; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_4i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_4i1_double:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
+; VL-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
%1 = uitofp <4 x i1> %mask to <4 x double>
ret <4 x double> %1
}

define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
-; KNL-LABEL: uitofp_2i1_float:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_2i1_float:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpextrq $1, %xmm0, %rax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
+; NOVL-NEXT: vmovq %xmm0, %rax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; NOVL-NEXT: retq
;
-; SKX-LABEL: uitofp_2i1_float:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
-; SKX-NEXT: retq
+; VL-LABEL: uitofp_2i1_float:
+; VL: ## BB#0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
}
define <2 x double> @uitofp_2i1_double(<2 x i32> %a) {
-; KNL-LABEL: uitofp_2i1_double:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: uitofp_2i1_double:
+; NOVL: ## BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: uitofp_2i1_double:
+; VLDQ: ## BB#0:
+; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0
+; VLDQ-NEXT: retq
;
-; SKX-LABEL: uitofp_2i1_double:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
-; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
-; SKX-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; SKX-NEXT: retq
+; VLNODQ-LABEL: uitofp_2i1_double:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
+; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VLNODQ-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
ret <2 x double> %1
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 42579377ef391..3f427298c177c 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -179,3 +179,22 @@ define float @pr30561_f32(float %b, float %a, i1 %c) {
%cond = select i1 %c, float %a, float %b
ret float %cond
}
+
+define <16 x i16> @pr31515(<16 x i1> %a, <16 x i1> %b, <16 x i16> %c) nounwind {
+; CHECK-LABEL: pr31515:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
+; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT: vpslld $31, %zmm0, %zmm0
+; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpmovdw %zmm0, %ymm0
+; CHECK-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %mask = and <16 x i1> %a, %b
+ %res = select <16 x i1> %mask, <16 x i16> zeroinitializer, <16 x i16> %c
+ ret <16 x i16> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index 04d21ecd3e824..fb6c55b26e7c4 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -505,9 +505,8 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
; KNL-LABEL: usat_trunc_wb_256_mem:
; KNL: ## BB#0:
-; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqu %xmm0, (%rdi)
; KNL-NEXT: retq
;
@@ -525,9 +524,8 @@ define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
; KNL-LABEL: usat_trunc_wb_256:
; KNL: ## BB#0:
-; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256:
@@ -607,3 +605,103 @@ define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
ret void
}
+define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
+; KNL-LABEL: usat_trunc_db_1024:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovusdb %zmm0, %xmm0
+; KNL-NEXT: vpmovusdb %zmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: usat_trunc_db_1024:
+; SKX: ## BB#0:
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1
+; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0
+; SKX-NEXT: vpmovdw %zmm0, %ymm0
+; SKX-NEXT: vpmovdw %zmm1, %ymm1
+; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: vpmovwb %zmm0, %ymm0
+; SKX-NEXT: retq
+ %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x6 = trunc <32 x i32> %x5 to <32 x i8>
+ ret <32 x i8> %x6
+}
+
+define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
+; KNL-LABEL: usat_trunc_db_1024_mem:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovusdb %zmm0, %xmm0
+; KNL-NEXT: vpmovusdb %zmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vmovdqu %ymm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: usat_trunc_db_1024_mem:
+; SKX: ## BB#0:
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1
+; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0
+; SKX-NEXT: vpmovdw %zmm0, %ymm0
+; SKX-NEXT: vpmovdw %zmm1, %ymm1
+; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: vpmovwb %zmm0, (%rdi)
+; SKX-NEXT: retq
+ %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+ %x6 = trunc <32 x i32> %x5 to <32 x i8>
+ store <32 x i8> %x6, <32 x i8>* %p, align 1
+ ret void
+}
+
+define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
+; ALL-LABEL: usat_trunc_dw_512:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovusdw %zmm0, %ymm0
+; ALL-NEXT: retq
+ %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %x6 = trunc <16 x i32> %x5 to <16 x i16>
+ ret <16 x i16> %x6
+}
+
+define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
+; ALL-LABEL: usat_trunc_wb_128:
+; ALL: ## BB#0:
+; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; ALL-NEXT: retq
+ %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ %x6 = trunc <8 x i16> %x5 to <8 x i8>
+ ret <8 x i8> %x6
+}
+
+define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
+; KNL-LABEL: usat_trunc_qw_1024:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
+; KNL-NEXT: vpminuq %zmm2, %zmm1, %zmm1
+; KNL-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: vpmovqd %zmm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: usat_trunc_qw_1024:
+; SKX: ## BB#0:
+; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
+; SKX-NEXT: vpminuq %zmm2, %zmm1, %zmm1
+; SKX-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; SKX-NEXT: vpmovqd %zmm0, %ymm0
+; SKX-NEXT: vpmovqd %zmm1, %ymm1
+; SKX-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: vpmovdw %zmm0, %ymm0
+; SKX-NEXT: retq
+ %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+ %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+ %x6 = trunc <16 x i64> %x5 to <16 x i16>
+ ret <16 x i16> %x6
+}
+
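The usat_trunc tests above all exercise the same saturation idiom: an unsigned compare against the destination type's maximum, a select that clamps to that maximum, then a trunc. The backend matches this sequence to the vpmovus*/vpminu* code checked above. A minimal sketch of the idiom at <4 x i32> width, with a hypothetical function name not taken from the patch:

define <4 x i8> @usat_idiom_sketch(<4 x i32> %i) {
  ; clamp to the i8 maximum, then truncate: the canonical unsigned-saturation shape
  %c = icmp ult <4 x i32> %i, <i32 255, i32 255, i32 255, i32 255>
  %s = select <4 x i1> %c, <4 x i32> %i, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
  %t = trunc <4 x i32> %s to <4 x i8>
  ret <4 x i8> %t
}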
diff --git a/test/CodeGen/X86/bypass-slow-division-32.ll b/test/CodeGen/X86/bypass-slow-division-32.ll
new file mode 100644
index 0000000000000..ea545d22385c1
--- /dev/null
+++ b/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Check that 32-bit division is bypassed correctly.
+; RUN: llc < %s -mattr=+idivl-to-divb -mtriple=i686-linux | FileCheck %s
+
+define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: retl
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_get_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je .LBB1_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB1_1:
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: retl
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient_and_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je .LBB2_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB2_1:
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %ah, %edx # NOREX
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: Test_use_div_and_idiv:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl %ecx, %edi
+; CHECK-NEXT: orl %ebx, %edi
+; CHECK-NEXT: testl $-256, %edi
+; CHECK-NEXT: je .LBB3_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ebx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: testl $-256, %edi
+; CHECK-NEXT: jne .LBB3_5
+; CHECK-NEXT: jmp .LBB3_4
+; CHECK-NEXT: .LBB3_1:
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %bl
+; CHECK-NEXT: movzbl %al, %esi
+; CHECK-NEXT: testl $-256, %edi
+; CHECK-NEXT: je .LBB3_4
+; CHECK-NEXT: .LBB3_5:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: jmp .LBB3_6
+; CHECK-NEXT: .LBB3_4:
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %bl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: .LBB3_6:
+; CHECK-NEXT: addl %eax, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @Test_use_div_imm_imm() nounwind {
+; CHECK-LABEL: Test_use_div_imm_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $64, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @Test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_div_reg_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1041204193, %eax # imm = 0x3E0F83E1
+; CHECK-NEXT: imull {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $3, %edx
+; CHECK-NEXT: leal (%edx,%eax), %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @Test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_rem_reg_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $3, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $5, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_divrem_reg_imm:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $3, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $5, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_div_imm_reg:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: testl $-256, %ecx
+; CHECK-NEXT: je .LBB8_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movl $4, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB8_1:
+; CHECK-NEXT: movb $4, %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @Test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK-LABEL: Test_use_rem_imm_reg:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: testl $-256, %ecx
+; CHECK-NEXT: je .LBB9_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movl $4, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB9_1:
+; CHECK-NEXT: movb $4, %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: retl
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
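For reference, the guarded expansion these checks are compiled from has the following IR shape: test whether either operand has bits set above the low byte, then branch between an 8-bit divide (divb) and the full 32-bit path (idivl). A minimal sketch with hypothetical value names; the real rewrite is performed by the slow-division bypass during CodeGenPrepare:

define i32 @bypass_sketch(i32 %a, i32 %b) {
entry:
  %or = or i32 %a, %b
  %hibits = and i32 %or, -256        ; any bits above the low 8?
  %small = icmp eq i32 %hibits, 0
  br i1 %small, label %fast, label %slow

fast:                                ; both operands fit in a byte
  %a8 = trunc i32 %a to i8
  %b8 = trunc i32 %b to i8
  %q8 = udiv i8 %a8, %b8             ; lowers to divb
  %qfast = zext i8 %q8 to i32
  br label %join

slow:
  %qslow = sdiv i32 %a, %b           ; lowers to idivl
  br label %join

join:
  %q = phi i32 [ %qfast, %fast ], [ %qslow, %slow ]
  ret i32 %q
}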
diff --git a/test/CodeGen/X86/bypass-slow-division-64.ll b/test/CodeGen/X86/bypass-slow-division-64.ll
new file mode 100644
index 0000000000000..b067f9e1503c6
--- /dev/null
+++ b/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Check that 64-bit division is bypassed correctly.
+; RUN: llc < %s -mattr=+idivq-to-divl -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+; Additional tests for 64-bit divide bypass
+
+define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT: retq
+ %result = sdiv i64 %a, %b
+ ret i64 %result
+}
+
+define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_get_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: je .LBB1_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_1:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
+ %result = srem i64 %a, %b
+ ret i64 %result
+}
+
+define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: Test_get_quotient_and_remainder:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: je .LBB2_1
+; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB2_1:
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+ %resultdiv = sdiv i64 %a, %b
+ %resultrem = srem i64 %a, %b
+ %result = add i64 %resultdiv, %resultrem
+ ret i64 %result
+}
diff --git a/test/CodeGen/X86/bypass-slow-division-tune.ll b/test/CodeGen/X86/bypass-slow-division-tune.ll
new file mode 100644
index 0000000000000..b6a53130cf23e
--- /dev/null
+++ b/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -0,0 +1,55 @@
+; Check that a division is bypassed only when appropriate.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=atom < %s | FileCheck -check-prefixes=ATOM,CHECK %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s | FileCheck -check-prefixes=REST,CHECK %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck -check-prefixes=REST,CHECK %s
+
+; Verify that div32 is bypassed only for Atoms.
+define i32 @div32(i32 %a, i32 %b) {
+entry:
+; ATOM-LABEL: div32:
+; ATOM: orl %{{.*}}, [[REG:%[a-z]+]]
+; ATOM: testl $-256, [[REG]]
+; ATOM: divb
+;
+; REST-LABEL: div32:
+; REST-NOT: divb
+;
+ %div = sdiv i32 %a, %b
+ ret i32 %div
+}
+
+; Verify that div64 is always bypassed.
+define i64 @div64(i64 %a, i64 %b) {
+entry:
+; CHECK-LABEL: div64:
+; CHECK: orq %{{.*}}, [[REG:%[a-z]+]]
+; CHECK: shrq $32, [[REG]]
+; CHECK: divl
+;
+ %div = sdiv i64 %a, %b
+ ret i64 %div
+}
+
+
+; Verify that no extra code is generated when optimizing for size.
+
+define i64 @div64_optsize(i64 %a, i64 %b) optsize {
+; CHECK-LABEL: div64_optsize:
+; CHECK-NOT: divl
+ %div = sdiv i64 %a, %b
+ ret i64 %div
+}
+
+define i32 @div32_optsize(i32 %a, i32 %b) optsize {
+; CHECK-LABEL: div32_optsize:
+; CHECK-NOT: divb
+ %div = sdiv i32 %a, %b
+ ret i32 %div
+}
+
+define i32 @div32_minsize(i32 %a, i32 %b) minsize {
+; CHECK-LABEL: div32_minsize:
+; CHECK-NOT: divb
+ %div = sdiv i32 %a, %b
+ ret i32 %div
+}
diff --git a/test/CodeGen/X86/change-unsafe-fp-math.ll b/test/CodeGen/X86/change-unsafe-fp-math.ll
new file mode 100644
index 0000000000000..33a7ec9bfc794
--- /dev/null
+++ b/test/CodeGen/X86/change-unsafe-fp-math.ll
@@ -0,0 +1,56 @@
+; Check that we can enable/disable UnsafeFPMath via function attributes. An
+; attribute on one function should not magically apply to the next one.
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
+
+; The div in these functions should be converted to a mul when unsafe-fp-math
+; is enabled.
+
+; CHECK-LABEL: unsafe_fp_math_default0:
+define double @unsafe_fp_math_default0(double %x) {
+; SAFE: divsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_off:
+define double @unsafe_fp_math_off(double %x) #0 {
+; SAFE: divsd
+; UNSAFE: divsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_default1:
+define double @unsafe_fp_math_default1(double %x) {
+; With unsafe math enabled, this div can be changed to a mul.
+; SAFE: divsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_on:
+define double @unsafe_fp_math_on(double %x) #1 {
+; SAFE: mulsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+; CHECK-LABEL: unsafe_fp_math_default2:
+define double @unsafe_fp_math_default2(double %x) {
+; With unsafe math enabled, this div can be changed to a mul.
+; SAFE: divsd
+; UNSAFE: mulsd
+ %div = fdiv double %x, 2.0
+ ret double %div
+}
+
+attributes #0 = { "unsafe-fp-math"="false" }
+attributes #1 = { "unsafe-fp-math"="true" }
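The fold being toggled is an fdiv by a constant becoming a multiply by its reciprocal; even though 1.0/2.0 is exactly representable, the backend only performs it when unsafe-fp-math is in effect. A minimal sketch of the per-function attribute form, with a hypothetical function name:

define double @fdiv_to_fmul_sketch(double %x) "unsafe-fp-math"="true" {
  ; with the string attribute set, llc may emit mulsd for this fdiv
  %div = fdiv double %x, 2.0
  ret double %div
}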
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index d24f27ddf22c7..5d05c699f4311 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -281,4 +281,54 @@ define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
; CHECK: setne
; CHECK: testl
; CHECK: setne
-}
\ No newline at end of file
+}
+
+define i32 @test21(i64 %val) {
+ %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+ %cmp = icmp ne i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test21
+; CHECK: shrq $41, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
+
+; The AND-to-SHR transformation is enabled only for eq/ne condition codes.
+define i32 @test22(i64 %val) {
+ %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+ %cmp = icmp ult i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test22
+; CHECK-NOT: shrq $41
+; CHECK: retq
+}
+
+define i32 @test23(i64 %val) {
+ %and = and i64 %val, -1048576 ; 0xFFFFFFFFFFF00000
+ %cmp = icmp ne i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test23
+; CHECK: testq $-1048576, %rdi
+; CHECK: setne %al
+; CHECK: retq
+}
+
+define i32 @test24(i64 %val) {
+ %and = and i64 %val, 281474976710655 ; 0x0000FFFFFFFFFFFF
+ %cmp = icmp ne i64 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+
+; CHECK-LABEL: test24
+; CHECK: shlq $16, %rdi
+; CHECK-NOT: test
+; CHECK: setne %al
+; CHECK: retq
+}
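The shift folds in test21 and test24 work because each mask is a contiguous run of ones anchored at one end of the word, so comparing the masked value against zero is equivalent to shifting the unmasked bits away and testing what remains. A sketch of the test21 equivalence, with a hypothetical function name:

define i1 @and_to_shr_sketch(i64 %val) {
  ; -2199023255552 is 0xFFFFFE0000000000, i.e. ones in bits 41..63, so
  ; (%val & mask) != 0 is the same as (%val >> 41) != 0 -- hence shrq $41
  %shr = lshr i64 %val, 41
  %ret = icmp ne i64 %shr, 0
  ret i1 %ret
}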
diff --git a/test/CodeGen/X86/cpus.ll b/test/CodeGen/X86/cpus.ll
index ee1f7bb5295be..20ce932a184b5 100644
--- a/test/CodeGen/X86/cpus.ll
+++ b/test/CodeGen/X86/cpus.ll
@@ -33,3 +33,4 @@
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index 13448a13ab4cb..8c12e7148aa7f 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -404,6 +404,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i8_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movb (%rdi,%rax), %al
@@ -411,6 +412,7 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
;
; AVX-LABEL: extractelement_v16i8_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movb (%rdi,%rax), %al
@@ -426,6 +428,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $31, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movq %rsp, %rax
@@ -440,6 +443,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: andl $31, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: movq %rsp, %rax
; AVX-NEXT: movb (%rdi,%rax), %al
@@ -454,12 +458,14 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i16_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i16_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $7, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; AVX-NEXT: retq
@@ -474,6 +480,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax
@@ -487,6 +494,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax
; AVX-NEXT: movq %rbp, %rsp
@@ -500,12 +508,14 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v4i32_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i32_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax
; AVX-NEXT: retq
@@ -520,6 +530,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movl (%rsp,%rdi,4), %eax
@@ -533,6 +544,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: andl $7, %edi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: movl (%rsp,%rdi,4), %eax
; AVX1-NEXT: movq %rbp, %rsp
@@ -554,12 +566,14 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v2i64_var:
; SSE: # BB#0:
+; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v2i64_var:
; AVX: # BB#0:
+; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax
; AVX-NEXT: retq
@@ -574,6 +588,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movq (%rsp,%rdi,8), %rax
@@ -587,6 +602,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: movq (%rsp,%rdi,8), %rax
; AVX-NEXT: movq %rbp, %rsp
diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 946516c8a46dc..c418e67ecb670 100644
--- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -16,11 +16,11 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
; CHECK-NEXT: movl 20(%esp), %edx
; CHECK-NEXT: paddd (%edx), %xmm0
; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl (%ecx,%edx), %esi
-; CHECK-NEXT: movl 12(%ecx,%edx), %edi
-; CHECK-NEXT: movl 8(%ecx,%edx), %ebx
-; CHECK-NEXT: movl 4(%ecx,%edx), %edx
+; CHECK-NEXT: movl (%edx), %esi
+; CHECK-NEXT: movl 12(%edx), %edi
+; CHECK-NEXT: movl 8(%edx), %ebx
+; CHECK-NEXT: movl 4(%edx), %edx
+; CHECK-NEXT: shll $4, %ecx
; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
; CHECK-NEXT: movl %edx, (%eax,%ecx)
; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll
index 1fa7527742513..7b1926da245ca 100644
--- a/test/CodeGen/X86/i64-mem-copy.ll
+++ b/test/CodeGen/X86/i64-mem-copy.ll
@@ -68,9 +68,10 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) {
define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) {
; X32-LABEL: PR23476:
+; X32: andl $7, %eax
; X32: movsd {{.*#+}} xmm0 = mem[0],zero
; X32: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: movsd %xmm0, (%ecx)
%ext = extractelement <5 x i64> %in, i32 %index
store i64 %ext, i64* %out, align 8
ret void
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
index 39f5242a84775..81351511374c4 100644
--- a/test/CodeGen/X86/implicit-null-checks.mir
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -319,7 +319,7 @@ liveins:
- { reg: '%rsi' }
# CHECK: bb.0.entry:
# CHECK: %rbx = MOV64rr %rdx
-# CHECK-NEXT: %rdi = FAULTING_LOAD_OP %bb.3.is_null, 260, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: %rdi = FAULTING_LOAD_OP %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
body: |
bb.0.entry:
diff --git a/test/CodeGen/X86/lzcnt-zext-cmp.ll b/test/CodeGen/X86/lzcnt-zext-cmp.ll
index 6f4cb84a2b9cb..c69dbf573f46d 100644
--- a/test/CodeGen/X86/lzcnt-zext-cmp.ll
+++ b/test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -3,6 +3,8 @@
; Eg: zext(or(setcc(cmp), setcc(cmp))) -> shr(or(lzcnt, lzcnt))
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
; Test one 32-bit input, output is 32-bit, no transformations expected.
define i32 @test_zext_cmp0(i32 %a) {
diff --git a/test/CodeGen/X86/peephole.mir b/test/CodeGen/X86/peephole.mir
new file mode 100644
index 0000000000000..6391836e9ca24
--- /dev/null
+++ b/test/CodeGen/X86/peephole.mir
@@ -0,0 +1,40 @@
+# RUN: llc -mtriple=x86_64-- -run-pass=peephole-opt %s -o - | FileCheck %s
+--- |
+ define void @func() { ret void }
+...
+---
+# Check that instructions with MI.isBitcast() are only replaced by COPY if there
+# are no SUBREG_TO_REG users.
+# CHECK-LABEL: name: func
+name: func
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: fr32 }
+ - { id: 2, class: gr32 }
+
+ - { id: 3, class: gr32 }
+ - { id: 4, class: fr32 }
+ - { id: 5, class: gr32 }
+ - { id: 6, class: gr64 }
+
+body: |
+ bb.0:
+ ; CHECK: %1 = VMOVDI2SSrr %0
+ ; CHECK: %7 = COPY %0
+ ; CHECK: NOOP implicit %7
+ %0 = MOV32ri 42
+ %1 = VMOVDI2SSrr %0
+ %2 = MOVSS2DIrr %1
+ NOOP implicit %2
+
+ ; CHECK: %4 = VMOVDI2SSrr %3
+ ; CHECK-NOT: COPY
+ ; CHECK: %5 = MOVSS2DIrr %4
+ ; CHECK: %6 = SUBREG_TO_REG %5, 0
+ ; CHECK: NOOP implicit %6
+ %3 = MOV32ri 42
+ %4 = VMOVDI2SSrr %3
+ %5 = MOVSS2DIrr %4
+ %6 = SUBREG_TO_REG %5, 0, %subreg.sub_32bit
+ NOOP implicit %6
+...
diff --git a/test/CodeGen/X86/slow-div.ll b/test/CodeGen/X86/slow-div.ll
deleted file mode 100644
index 82928521ac2b2..0000000000000
--- a/test/CodeGen/X86/slow-div.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s
-
-define i32 @div32(i32 %a, i32 %b) {
-entry:
-; DIV32-LABEL: div32:
-; DIV32: orl %{{.*}}, [[REG:%[a-z]+]]
-; DIV32: testl $-256, [[REG]]
-; DIV32: divb
-; DIV64-LABEL: div32:
-; DIV64-NOT: divb
- %div = sdiv i32 %a, %b
- ret i32 %div
-}
-
-define i64 @div64(i64 %a, i64 %b) {
-entry:
-; DIV32-LABEL: div64:
-; DIV32-NOT: divw
-; DIV64-LABEL: div64:
-; DIV64: orq %{{.*}}, [[REG:%[a-z]+]]
-; DIV64: testq $-65536, [[REG]]
-; DIV64: divw
- %div = sdiv i64 %a, %b
- ret i64 %div
-}
-
-; Verify that no extra code is generated when optimizing for size.
-
-define i32 @div32_optsize(i32 %a, i32 %b) optsize {
-; DIV32-LABEL: div32_optsize:
-; DIV32-NOT: divb
- %div = sdiv i32 %a, %b
- ret i32 %div
-}
-
-define i32 @div32_minsize(i32 %a, i32 %b) minsize {
-; DIV32-LABEL: div32_minsize:
-; DIV32-NOT: divb
- %div = sdiv i32 %a, %b
- ret i32 %div
-}
-
diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll
index 41e9a95bcdd8e..8251eb324a772 100644
--- a/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -46,6 +46,7 @@
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver2 2>&1 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver3 2>&1 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=FAST
; Other chips with slow unaligned memory accesses
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 4af9758f122dc..972a33f13cd00 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -1257,15 +1257,12 @@ define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_sd:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: cvtsi2sdl %eax, %xmm1
-; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_sd:
; X64: # BB#0:
-; X64-NEXT: cvtsi2sdl %edi, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: cvtsi2sdl %edi, %xmm0
; X64-NEXT: retq
%cvt = sitofp i32 %a1 to double
%res = insertelement <2 x double> %a0, double %cvt, i32 0
@@ -1293,14 +1290,12 @@ define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cvtss_sd:
; X32: # BB#0:
-; X32-NEXT: cvtss2sd %xmm1, %xmm1
-; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: cvtss2sd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_sd:
; X64: # BB#0:
-; X64-NEXT: cvtss2sd %xmm1, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: cvtss2sd %xmm1, %xmm0
; X64-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
%cvt = fpext float %ext to double
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 85c7875d923b8..1dc8b7abd2071 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -12,6 +12,7 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl $76, (%esp,%eax,4)
; X32-NEXT: movl (%esp), %eax
@@ -21,9 +22,10 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t0:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl $76, -24(%rsp,%rax,4)
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movl $76, -24(%rsp,%rdi,4)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 %t7
@@ -38,6 +40,7 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movl $76, %ecx
; X32-NEXT: pinsrd $0, %ecx, %xmm0
; X32-NEXT: movdqa %xmm0, (%esp)
@@ -48,11 +51,12 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t1:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movl $76, %eax
; X64-NEXT: pinsrd $0, %eax, %xmm0
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movl -24(%rsp,%rdi,4), %eax
; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 0
%t9 = extractelement <4 x i32> %t13, i32 %t7
@@ -66,6 +70,7 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movdqa %xmm0, (%esp)
; X32-NEXT: pinsrd $0, (%esp,%eax,4), %xmm0
; X32-NEXT: movl %ebp, %esp
@@ -74,9 +79,10 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t2:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: pinsrd $0, -24(%rsp,%rax,4), %xmm0
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: pinsrd $0, -24(%rsp,%rdi,4), %xmm0
; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 %t7
%t13 = insertelement <4 x i32> %t8, i32 %t9, i32 0
@@ -90,6 +96,7 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movss %xmm0, (%esp,%eax,4)
; X32-NEXT: movaps (%esp), %xmm0
@@ -99,9 +106,10 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
;
; X64-LABEL: t3:
; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movss %xmm0, -24(%rsp,%rax,4)
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movss %xmm0, -24(%rsp,%rdi,4)
; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 0
diff --git a/test/CodeGen/X86/vec_insert-4.ll b/test/CodeGen/X86/vec_insert-4.ll
index c847ac983003a..82627c54e663d 100644
--- a/test/CodeGen/X86/vec_insert-4.ll
+++ b/test/CodeGen/X86/vec_insert-4.ll
@@ -10,6 +10,7 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X32-NEXT: andl $-32, %esp
; X32-NEXT: subl $64, %esp
; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: andl $7, %eax
; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
@@ -25,10 +26,11 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: andq $-32, %rsp
; X64-NEXT: subq $64, %rsp
+; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; X64-NEXT: movaps %xmm0, (%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl $1084227584, (%rsp,%rax,4) ## imm = 0x40A00000
+; X64-NEXT: andl $7, %edi
+; X64-NEXT: movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000
; X64-NEXT: movaps (%rsp), %xmm0
; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; X64-NEXT: movq %rbp, %rsp
diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll
index d612e7eb10d35..4074b6d32353a 100644
--- a/test/CodeGen/X86/vec_insert-8.ll
+++ b/test/CodeGen/X86/vec_insert-8.ll
@@ -11,10 +11,11 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: movl 12(%ebp), %ecx
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: andl $3, %eax
+; X32-NEXT: movl 8(%ebp), %ecx
; X32-NEXT: movaps %xmm0, (%esp)
-; X32-NEXT: movl %eax, (%esp,%ecx,4)
+; X32-NEXT: movl %ecx, (%esp,%eax,4)
; X32-NEXT: movaps (%esp), %xmm0
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
@@ -22,9 +23,10 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
;
; X64-LABEL: var_insert:
; X64: # BB#0: # %entry
+; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %esi, %rax
-; X64-NEXT: movl %edi, -24(%rsp,%rax,4)
+; X64-NEXT: andl $3, %esi
+; X64-NEXT: movl %edi, -24(%rsp,%rsi,4)
; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
@@ -40,6 +42,7 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
; X32-NEXT: andl $-16, %esp
; X32-NEXT: subl $32, %esp
; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: andl $3, %eax
; X32-NEXT: movaps %xmm0, (%esp)
; X32-NEXT: movl (%esp,%eax,4), %eax
; X32-NEXT: movl %ebp, %esp
@@ -48,9 +51,10 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
;
; X64-LABEL: var_extract:
; X64: # BB#0: # %entry
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movl -24(%rsp,%rdi,4), %eax
; X64-NEXT: retq
entry:
%tmp3 = extractelement <4 x i32> %x, i32 %idx
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 6a81cdc490fe5..923af1216d058 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4818,3 +4818,63 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
store <8 x float> %4, <8 x float>* %3, align 32
ret void
}
+
+define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
+; SSE-LABEL: sitofp_i32_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2sdl %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i32 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
+; SSE-LABEL: sitofp_i32_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2ssl %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i32_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i32 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
+define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
+; SSE-LABEL: sitofp_i64_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2sdq %rdi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i64_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i64 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
+; SSE-LABEL: sitofp_i64_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtsi2ssq %rdi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_i64_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp i64 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 440faa689fb8a..9f0d4a7d7264a 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -7,7 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -89,6 +90,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -193,6 +199,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
@@ -339,6 +350,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: var_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
@@ -515,6 +539,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -624,6 +656,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
@@ -669,6 +706,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
@@ -712,6 +755,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
@@ -907,6 +956,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1033,6 +1091,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
@@ -1114,6 +1177,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1207,6 +1275,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: constant_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1367,6 +1447,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -1480,6 +1567,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1514,6 +1606,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrad $5, %xmm0
@@ -1543,6 +1640,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psraw $3, %xmm0
@@ -1586,6 +1688,15 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 79902acfec24c..aee2857157b6b 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -5,6 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+
;
; Variable Shifts
;
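
The two new RUN lines share the AVX512VL prefix for output common to both feature sets, with AVX512DQVL/AVX512BWVL reserved for cases where the two lowerings diverge. A minimal sketch (hypothetical test, not part of this commit) of how one such check is laid out:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512BWVL
define <8 x i16> @sketch(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BWVL-LABEL: sketch:
; AVX512BWVL: vpsravw %xmm1, %xmm0, %xmm0
  %shift = ashr <8 x i16> %a, %b
  ret <8 x i16> %shift
}
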
@@ -74,6 +77,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubq %ymm3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <4 x i64> %a, %b
ret <4 x i64> %shift
}
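
Without VL, AVX512 has no 256-bit vpsravq, so the existing path falls back on vpsrlvq plus a sign fixup (the vpsubq above); the VL run collapses the whole thing to one vpsravq. The fixup relies on the identity ashr(a, b) == (lshr(a, b) ^ m) - m with m = lshr(0x8000000000000000, b), sketched in IR (hypothetical function name):

define <4 x i64> @ashr_via_lshr(<4 x i64> %a, <4 x i64> %b) {
  ; m holds the original sign bit, shifted into its post-shift position
  %m = lshr <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, %b
  %s = lshr <4 x i64> %a, %b
  ; xor/sub against m sign-extends the logically shifted value
  %x = xor <4 x i64> %s, %m
  %r = sub <4 x i64> %x, %m
  ret <4 x i64> %r
}
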
@@ -135,6 +143,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -228,6 +241,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -375,6 +401,42 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <32 x i8> %a, %b
ret <32 x i8> %shift
}
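
The AVX512DQVL expansion above is the usual vpblendvb ladder: vpsllw $5 pre-shifts the amounts so that on each round the next amount bit sits in the byte MSB, which vpblendvb tests, and the rounds conditionally apply shifts of 4, 2 and 1. A single-byte IR model of the ladder (hypothetical, valid for amounts 0-7):

define i8 @ashr_ladder(i8 %x, i8 %b) {
  ; round 1: apply >>4 where bit 2 of the amount is set
  %b4 = and i8 %b, 4
  %c4 = icmp ne i8 %b4, 0
  %s4 = ashr i8 %x, 4
  %x4 = select i1 %c4, i8 %s4, i8 %x
  ; round 2: apply >>2 where bit 1 is set
  %b2 = and i8 %b, 2
  %c2 = icmp ne i8 %b2, 0
  %s2 = ashr i8 %x4, 2
  %x2 = select i1 %c2, i8 %s2, i8 %x4
  ; round 3: apply >>1 where bit 0 is set
  %b1 = and i8 %b, 1
  %c1 = icmp ne i8 %b1, 0
  %s1 = ashr i8 %x2, 1
  %x1 = select i1 %c1, i8 %s1, i8 %x2
  ret i8 %x1
}
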
@@ -435,6 +497,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = ashr <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -476,6 +543,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = ashr <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -517,6 +590,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = ashr <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -662,6 +741,44 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = ashr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -724,6 +841,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -769,6 +891,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -844,6 +971,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -981,6 +1120,42 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $2, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4
+; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $2, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3
+; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -1033,6 +1208,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -1068,6 +1248,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -1103,6 +1288,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -1160,6 +1350,15 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll
index 2c9e433cfb2ca..6cc98b5f3eeb1 100644
--- a/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
;
; Variable Shifts
;
@@ -99,399 +100,36 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
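
The old AVX512BW output extracted all 64 bytes and shifted them one at a time with scalar sarb; the replacement keeps everything in zmm registers. Since vpblendvb has no 512-bit form, the blend in each ladder round becomes vpmovb2m (byte sign bits into a k-register) followed by a masked vmovdqu8. A hedged IR model of that select step (hypothetical function name):

define <64 x i8> @msb_select(<64 x i8> %sel, <64 x i8> %t, <64 x i8> %f) {
  ; vpmovb2m: a lane is selected when its MSB is set
  %m = icmp slt <64 x i8> %sel, zeroinitializer
  ; vmovdqu8 %t, %f {%k1}: the masked move acts as the blend
  %r = select <64 x i1> %m, <64 x i8> %t, <64 x i8> %f
  ret <64 x i8> %r
}
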
@@ -590,399 +228,36 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: sarb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = ashr <64 x i8> %a, %splat
@@ -1080,252 +355,36 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: sarb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: sarb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: sarb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: sarb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: sarb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: sarb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
+; AVX512BW-NEXT: vpsraw $4, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm3
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[10],zmm3[10],zmm0[11],zmm3[11],zmm0[12],zmm3[12],zmm0[13],zmm3[13],zmm0[14],zmm3[14],zmm0[15],zmm3[15],zmm0[24],zmm3[24],zmm0[25],zmm3[25],zmm0[26],zmm3[26],zmm0[27],zmm3[27],zmm0[28],zmm3[28],zmm0[29],zmm3[29],zmm0[30],zmm3[30],zmm0[31],zmm3[31],zmm0[40],zmm3[40],zmm0[41],zmm3[41],zmm0[42],zmm3[42],zmm0[43],zmm3[43],zmm0[44],zmm3[44],zmm0[45],zmm3[45],zmm0[46],zmm3[46],zmm0[47],zmm3[47],zmm0[56],zmm3[56],zmm0[57],zmm3[57],zmm0[58],zmm3[58],zmm0[59],zmm3[59],zmm0[60],zmm3[60],zmm0[61],zmm3[61],zmm0[62],zmm3[62],zmm0[63],zmm3[63]
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm3[0],zmm0[1],zmm3[1],zmm0[2],zmm3[2],zmm0[3],zmm3[3],zmm0[4],zmm3[4],zmm0[5],zmm3[5],zmm0[6],zmm3[6],zmm0[7],zmm3[7],zmm0[16],zmm3[16],zmm0[17],zmm3[17],zmm0[18],zmm3[18],zmm0[19],zmm3[19],zmm0[20],zmm3[20],zmm0[21],zmm3[21],zmm0[22],zmm3[22],zmm0[23],zmm3[23],zmm0[32],zmm3[32],zmm0[33],zmm3[33],zmm0[34],zmm3[34],zmm0[35],zmm3[35],zmm0[36],zmm3[36],zmm0[37],zmm3[37],zmm0[38],zmm3[38],zmm0[39],zmm3[39],zmm0[48],zmm3[48],zmm0[49],zmm3[49],zmm0[50],zmm3[50],zmm0[51],zmm3[51],zmm0[52],zmm3[52],zmm0[53],zmm3[53],zmm0[54],zmm3[54],zmm0[55],zmm3[55]
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
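; NOTE (editorial): the AVX512BW output added above replaces the old
; per-element extract/sarb/insert expansion with a vectorized shift
; ladder. A condensed sketch of the pattern, paraphrased from the new
; CHECK lines (the %zmmAMT/%zmmVAL/%zmmTMP names are placeholders for
; this sketch only, not registers from the test):
;
;   vpsllw   $5, %zmmAMT, %zmmAMT       ; move the 3 live amount bits to each byte's MSB
;   vpunpckhbw / vpunpcklbw             ; widen bytes into word lanes (high/low halves)
;   vpsraw   $4, %zmmVAL, %zmmTMP       ; candidate result: shift by 4
;   vpmovb2m %zmmAMT, %k1               ; byte MSBs -> k-mask
;   vmovdqu8 %zmmTMP, %zmmVAL {%k1}     ; keep shifted bytes where the amount bit was set
;   vpaddw   %zmmAMT, %zmmAMT, %zmmAMT  ; expose the next amount bit
;   ; ...repeat with $2 and $1, then vpsrlw $8 and vpackuswb repack the bytes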
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index a7e1a531b6598..9b8c0def4558f 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -7,6 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -65,6 +67,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -162,6 +169,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
@@ -308,6 +320,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: var_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
@@ -433,6 +458,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $5, %xmm1
@@ -492,6 +525,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
@@ -533,6 +571,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
@@ -576,6 +620,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
@@ -709,6 +759,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -798,6 +857,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -872,6 +936,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -965,6 +1034,18 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: constant_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -1071,6 +1152,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
@@ -1131,6 +1219,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlq $7, %xmm0
@@ -1160,6 +1253,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrld $5, %xmm0
@@ -1189,6 +1287,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
@@ -1223,6 +1326,12 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
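; NOTE (editorial): the two RUN lines added at the top of this file
; exercise it under -mattr=+avx512dq,+avx512vl and
; -mattr=+avx512bw,+avx512vl. Checks that are identical under both
; (for example the xmm-width vpsrlvq/vpsrlvd forms that VL enables)
; share the AVX512VL prefix; where the targets diverge, the prefixes
; split into AVX512DQVL and AVX512BWVL. The BWVL runs lower the i16
; shifts directly with vpsrlvw, while the DQVL runs still widen i16 to
; i32 (vpmovzxwd, vpsrlvd, vpmovdw). This summary is inferred from the
; CHECK lines themselves.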
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index 25667e7d1661a..58bb8f3e6ec01 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -5,6 +5,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Variable Shifts
;
@@ -51,6 +54,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -112,6 +120,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -205,6 +218,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -307,6 +333,30 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -346,6 +396,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = lshr <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -387,6 +442,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = lshr <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -428,6 +489,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = lshr <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -532,6 +599,32 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = lshr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -579,6 +672,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -624,6 +722,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -699,6 +802,18 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -795,6 +910,30 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -834,6 +973,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -869,6 +1013,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -904,6 +1053,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -947,6 +1101,12 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
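; NOTE (editorial): for the v32i8 logical shifts in this file, the two
; VL variants added above take different routes. A condensed sketch of
; the AVX512BWVL path, paraphrased from the CHECK lines:
;
;   vpmovzxbw %ymm1, %zmm1        ; shift amounts, byte -> word
;   vpmovzxbw %ymm0, %zmm0        ; values, byte -> word
;   vpsrlvw   %zmm1, %zmm0, %zmm0 ; native per-word variable shift
;   vpmovwb   %zmm0, %ymm0        ; truncate back to bytes
;
; The AVX512DQVL path (no BW) instead keeps ymm width and uses the
; classic vpsllw $5 amount encoding with a vpsrlw $4/$2/$1 ladder,
; masking each step with vpand and merging it with vpblendvb.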
diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll
index 3da8f9437e575..905445f30162a 100644
--- a/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -79,399 +79,21 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -553,399 +175,21 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shrb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = lshr <64 x i8> %a, %splat
@@ -1026,252 +270,21 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: shrb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: shrb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: shrb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: shrb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: shrb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
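; Context sketch (not part of the patch; a summary inferred from the checks
; above): the new AVX512BW sequence implements a variable per-byte logical
; right shift in three steps. Each step shifts the whole vector by 4, 2, then
; 1 with vpsrlw, masks off bits dragged across byte boundaries with vpandq,
; and merge-selects shifted vs. unshifted bytes per lane via a k-mask taken
; from the shift-amount bits (vpsllw $5 + vpmovb2m, advanced with vpaddb).
; A minimal standalone input that exercises this lowering -- file name
; hypothetical, command mirroring the tests' own RUN-line style:
;
;   llc < lshr-v64i8.ll -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw
;
define <64 x i8> @lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
  %shift = lshr <64 x i8> %a, %b  ; lowers to the vpsrlw/vpandq/vmovdqu8 ladder above
  ret <64 x i8> %shift
}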
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 8706078b40c99..32334420f8b22 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -7,6 +7,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -63,6 +65,11 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -128,6 +135,11 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $23, %xmm1
@@ -263,6 +275,19 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: var_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $12, %xmm1
@@ -383,6 +408,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: var_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $5, %xmm1
@@ -441,6 +474,11 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllq %xmm1, %xmm0
@@ -482,6 +520,12 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
@@ -525,6 +569,12 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
@@ -651,6 +701,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatvar_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -737,6 +796,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
@@ -792,6 +856,11 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
@@ -836,6 +905,16 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
;
+; AVX512DQVL-LABEL: constant_shift_v8i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v8i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
@@ -925,6 +1004,13 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: constant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
@@ -984,6 +1070,11 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllq $7, %xmm0
@@ -1013,6 +1104,11 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $5, %xmm0
@@ -1042,6 +1138,11 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
@@ -1074,6 +1175,12 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
+; AVX512VL-LABEL: splatconstant_shift_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
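; Side note (a sketch, not from the patch): the added RUN lines produce a
; shared AVX512VL prefix where the DQ+VL and BW+VL outputs agree, and split
; into AVX512DQVL/AVX512BWVL where they diverge -- BW+VL has vpsllvw for
; 16-bit elements, while DQ+VL falls back to vpmullw or a widen-to-dword
; sequence, as the checks above show. A minimal file demonstrating the
; pattern (file name and check placement are illustrative only):
;
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512BWVL
define <8 x i16> @shl_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BWVL: vpsllvw %xmm1, %xmm0, %xmm0
  %shift = shl <8 x i16> %a, %b
  ret <8 x i16> %shift
}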
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index a1ef2791c1b03..104fa089c7449 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -5,6 +5,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Variable Shifts
@@ -49,6 +51,11 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -93,6 +100,11 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: var_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -180,6 +192,19 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -271,6 +296,29 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -310,6 +358,11 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = shl <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -351,6 +404,12 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = shl <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -392,6 +451,12 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = shl <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -487,6 +552,31 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -531,6 +621,11 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -566,6 +661,11 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: constant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -609,6 +709,16 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i16:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i16:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -698,6 +808,29 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8:
+; AVX512DQVL: # BB#0:
+; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
%shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -737,6 +870,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -772,6 +910,11 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpslld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -807,6 +950,11 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512: # BB#0:
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -849,6 +997,12 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_shift_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
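; Illustrative sketch (the file name and minimal checks here are assumptions;
; the instruction expectations come from the blocks above): for v32i8 shifts
; the two VL configurations take different paths -- AVX512DQVL keeps the
; vpsllw/vpand/vpblendvb ladder, while AVX512BWVL zero-extends bytes to
; words, performs one vpsllvw on a zmm register, and narrows back with
; vpmovwb. A minimal input that reaches both paths:
;
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512BWVL
define <32 x i8> @shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL: vpblendvb
; AVX512BWVL: vpsllvw
  %shift = shl <32 x i8> %a, %b
  ret <32 x i8> %shift
}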
diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll
index b9c9b56427f18..180d6f3a3b03a 100644
--- a/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -76,399 +76,19 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
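The new AVX512BW sequence above implements the variable per-byte left shift as a select ladder: vpsllw $5 moves bit 2 of each shift amount into the byte's MSB, vpmovb2m turns the MSBs into a k-mask, and each masked step conditionally shifts by 4, then 2, then 1, doubling the amounts between steps to expose the next bit. A scalar C model of one byte lane, assuming only what the checks show (the vpandq constants are taken to clear the bits that vpsllw, a 16-bit shift, drags across byte boundaries):

#include <stdint.h>

/* One byte lane of the masked select ladder; a sketch, not LLVM's code. */
uint8_t shl_by_select(uint8_t x, uint8_t amt) {
    uint8_t t = (uint8_t)(amt << 5);   /* vpsllw $5: amt bit 2 becomes the MSB */
    if (t & 0x80)                      /* vpmovb2m reads the MSBs into k1     */
        x = (uint8_t)(x << 4);         /* masked vpsllw $4 + vpandq cleanup   */
    t = (uint8_t)(t + t);              /* vpaddb: amt bit 1 becomes the MSB   */
    if (t & 0x80)
        x = (uint8_t)(x << 2);         /* masked vpsllw $2 + vpandq cleanup   */
    t = (uint8_t)(t + t);              /* vpaddb: amt bit 0 becomes the MSB   */
    if (t & 0x80)
        x = (uint8_t)(x + x);          /* masked vpaddb: shift by 1           */
    return x;
}

For amounts 0..7 this reproduces x << amt; amounts of 8 and up are poison for the IR shl, so the ladder is free to ignore the high bits.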
@@ -547,399 +167,19 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: movzbl %dl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm5
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %sil
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: movzbl %sil, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %dl
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; AVX512BW-NEXT: shlb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = shl <64 x i8> %a, %splat
@@ -1013,252 +253,19 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: shlb $7, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: shlb $6, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: shlb $5, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: shlb $4, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: shlb $3, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: shlb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: addb %al, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $5, {{.*}}(%rip), %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
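With constant amounts the same ladder applies, but the first vpmovb2m input is a constant-pool value, so all three k-masks are fixed at compile time. A sketch of how they fall out of the 16-byte amount pattern in the shl above (one 16-lane slice of the zmm; the pattern repeats four times):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* per-byte shift amounts from the test, one 16-lane slice */
    static const uint8_t amt[16] = {0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0};
    uint16_t k1 = 0, k2 = 0, k3 = 0;
    for (int i = 0; i < 16; ++i) {
        if (amt[i] & 4) k1 |= 1u << i;  /* MSB after vpsllw $5          */
        if (amt[i] & 2) k2 |= 1u << i;  /* MSB after the first vpaddb   */
        if (amt[i] & 1) k3 |= 1u << i;  /* MSB after the second vpaddb  */
    }
    printf("k1=%#06x k2=%#06x k3=%#06x\n", k1, k2, k3);
    return 0;
}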
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
new file mode 100644
index 0000000000000..defc3e918b24e
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+; Expand 128 -> 256, covering <4 x float> and <2 x double>.
+define <8 x float> @expand(<4 x float> %a) {
+; SKX-LABEL: expand:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $5, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <8 x float> %res
+}
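The SKX path lowers this shuffle-with-zero to a zero-masking expand: a set mask bit means the output lane takes the next source element in order, a clear bit means the lane is zeroed. A scalar model of vexpandps {z} (the function name is illustrative):

#include <stddef.h>

/* Scalar model of VEXPANDPS with zero-masking ({z}). */
void expand_ps(float *dst, const float *src, unsigned mask, size_t lanes) {
    size_t j = 0;
    for (size_t i = 0; i < lanes; ++i)
        dst[i] = ((mask >> i) & 1) ? src[j++] : 0.0f;
}

For @expand the mask is $5 = 0b00000101, so lanes 0 and 2 receive a[0] and a[1] and every other lane is zeroed, which is exactly the shufflevector above.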
+
+define <8 x float> @expand1(<4 x float> %a ) {

+; SKX-LABEL: expand1:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $-86, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand1:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3>
+; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x float> %res
+}
+
+; Expand 128 -> 256: <2 x double> -> <4 x double>.
+define <4 x double> @expand2(<2 x double> %a) {
+; SKX-LABEL: expand2:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand2:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; KNL-NEXT: retq
+ %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
+ ret <4 x double> %res
+}
+
+; Expand 128 -> 256, covering <4 x i32> -> <8 x i32>.
+define <8 x i32> @expand3(<4 x i32> %a ) {
+; SKX-LABEL: expand3:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand3:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpbroadcastq %xmm0, %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,i32 5>
+ ret <8 x i32> %res
+}
+
+; Expand 128 -> 256, covering <2 x i64> -> <4 x i64>.
+define <4 x i64> @expand4(<2 x i64> %a ) {
+; SKX-LABEL: expand4:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand4:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; KNL-NEXT: retq
+ %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
+ ret <4 x i64> %res
+}
+
+; Negative test for 128 -> 256.
+define <8 x float> @expand5(<4 x float> %a ) {
+; SKX-LABEL: expand5:
+; SKX: # BB#0:
+; SKX-NEXT: vbroadcastss %xmm0, %ymm0
+; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand5:
+; KNL: # BB#0:
+; KNL-NEXT: vbroadcastss %xmm0, %ymm0
+; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
+ ret <8 x float> %res
+}
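expand5 marks the boundary of the combine: an expand hands out consecutive source elements, each exactly once and in order, so a pattern that reuses a source lane (here %a[0] in four positions) cannot be one and is lowered as broadcast + blend instead. A hypothetical legality check in that spirit, assuming, as in expand1 and expand5, that the zero vector is the first shuffle operand:

#include <stdbool.h>

/* idx[i] >= n selects element idx[i]-n of the nonzero source; anything else
   must be a known-zero lane. Expand requires the selected elements to be
   0, 1, 2, ... in order, each used once. */
bool is_expand(const int *idx, int lanes, int n) {
    int next = 0;
    for (int i = 0; i < lanes; ++i)
        if (idx[i] >= n && idx[i] - n != next++)
            return false;
    return true;
}

With expand1's mask <0,4,1,5,2,6,3,7> this returns true; with expand5's <0,4,1,4,2,4,3,4> the second reuse of index 4 fails the check.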
+
+; Expand 256 -> 512, covering <8 x float> -> <16 x float>.
+define <8 x float> @expand6(<4 x float> %a ) {
+; SKX-LABEL: expand6:
+; SKX: # BB#0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vinsertf{{.*}}$1, %xmm0, %ymm1, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand6:
+; KNL: # BB#0:
+; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <16 x float> @expand7(<8 x float> %a) {
+; SKX-LABEL: expand7:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movw $1285, %ax # imm = 0x505
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand7:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movw $1285, %ax # imm = 0x505
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x float> %res
+}
+
+define <16 x float> @expand8(<8 x float> %a ) {
+; SKX-LABEL: expand8:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand8:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <16 x float> %res
+}
+
+; Expand 256 -> 512, covering <4 x double> -> <8 x double>.
+define <8 x double> @expand9(<4 x double> %a) {
+; SKX-LABEL: expand9:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand9:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movb $-127, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
+ ret <8 x double> %res
+}
+
+define <16 x i32> @expand10(<8 x i32> %a ) {
+; SKX-LABEL: expand10:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand10:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <16 x i32> %res
+}
+
+define <8 x i64> @expand11(<4 x i64> %a) {
+; SKX-LABEL: expand11:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: movb $-127, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand11:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: movb $-127, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+ %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
+ ret <8 x i64> %res
+}
+
+; Negative test for 256 -> 512.
+define <16 x float> @expand12(<8 x float> %a) {
+; SKX-LABEL: expand12:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; SKX-NEXT: vxorps %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand12:
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8,i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
+ ret <16 x float> %res
+}
+
+define <16 x float> @expand13(<8 x float> %a ) {
+; SKX-LABEL: expand13:
+; SKX: # BB#0:
+; SKX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand13:
+; KNL: # BB#0:
+; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: retq
+ %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+; Checks the case where one source is a mixed-value vector and the mask points only at its known-zero elements.
+
+define <8 x float> @expand14(<4 x float> %a) {
+; SKX-LABEL: expand14:
+; SKX: # BB#0:
+; SKX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: movb $20, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand14:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
+; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT: retq
+ %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
+ %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
+ ret <8 x float> %res
+}
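Worked through, expand14 shows why the constant-folded fadd matters: %addV is <0,2,4,0>, so every lane the mask takes from it is provably zero, and the whole shuffle collapses to an expand with k-mask $20. A small check of that reading (the values for %a are illustrative stand-ins):

#include <assert.h>

int main(void) {
    float addv[4] = {0, 2, 4, 0};           /* fadd of the two constant vectors */
    float a[4] = {10, 11, 12, 13};          /* hypothetical values for %a       */
    int idx[8] = {3, 3, 4, 0, 5, 0, 0, 0};  /* shuffle mask; >= 4 selects %a    */
    float res[8];
    for (int i = 0; i < 8; ++i)
        res[i] = idx[i] < 4 ? addv[idx[i]] : a[idx[i] - 4];
    /* only lanes 2 and 4 are nonzero -> expand with k-mask 0b00010100 ($20) */
    assert(res[2] == a[0] && res[4] == a[1]);
    for (int i = 0; i < 8; ++i)
        if (i != 2 && i != 4) assert(res[i] == 0);
    return 0;
}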
+
+; Negative test.
+define <8 x float> @expand15(<4 x float> %a) {
+; SKX-LABEL: expand15:
+; SKX: # BB#0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
+; SKX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
+; SKX-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: expand15:
+; KNL: # BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
+; KNL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT: retq
+ %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
+ %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
+ ret <8 x float> %res
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 23e40a6572af1..b79df1facfa17 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -91,6 +91,20 @@ define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %res1
}
+define <4 x float> @combine_vpermil2ps_02zu(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: combine_vpermil2ps_02zu:
+; X32: # BB#0:
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: combine_vpermil2ps_02zu:
+; X64: # BB#0:
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X64-NEXT: retq
+ %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>, i8 0)
+ ret <4 x float> %res0
+}
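The new test pins down a fold of the XOP vpermil2ps intrinsic: with imm8 = 0, each selector's low two bits pick an element within the 128-bit lane and bit 2 picks between the two sources, so selector <0,2,4,undef> over a zero second source yields {a0[0], a0[2], 0, undef} -- precisely the zeroing insertps in the checks. A per-lane model under that reading of the selector (simplified; the imm8 zeroing modes are ignored):

/* One lane of VPERMIL2PS with imm8 == 0 (m2z zeroing disabled). */
float vpermil2ps_lane(const float a0[4], const float a1[4], unsigned sel) {
    const float *src = (sel & 4) ? a1 : a0;   /* bit 2: source select   */
    return src[sel & 3];                      /* bits 0-1: element pick */
}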
+
define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: combine_vpermil2ps256_identity:
; X32: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
index d130e7ff00b22..70b7fb16fc25e 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -12,6 +12,8 @@
define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; SSE: # BB#0:
+; SSE-NEXT: andl $1, %esi
+; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -19,6 +21,8 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6
;
; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; AVX: # BB#0:
+; AVX-NEXT: andl $1, %esi
+; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -33,9 +37,11 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6
define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; SSE: # BB#0:
-; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movslq %esi, %rcx
+; SSE-NEXT: andl $1, %esi
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -43,9 +49,11 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1)
;
; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %esi, %rcx
+; AVX-NEXT: andl $1, %esi
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -60,11 +68,15 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1)
define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movslq %edi, %rax
-; SSE2-NEXT: movslq %esi, %rsi
-; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: andl $3, %edi
+; SSE2-NEXT: andl $3, %esi
+; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: andl $3, %ecx
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -76,11 +88,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movslq %edi, %rax
-; SSSE3-NEXT: movslq %esi, %rsi
-; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: andl $3, %edi
+; SSSE3-NEXT: andl $3, %esi
+; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: andl $3, %ecx
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -92,11 +108,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE41: # BB#0:
-; SSE41-NEXT: movslq %edi, %rax
-; SSE41-NEXT: movslq %esi, %rsi
-; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: andl $3, %edi
+; SSE41-NEXT: andl $3, %esi
+; SSE41-NEXT: andl $3, %edx
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: andl $3, %ecx
; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -105,11 +125,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
-; AVX-NEXT: movslq %esi, %rsi
-; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $3, %edi
+; AVX-NEXT: andl $3, %esi
+; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -129,11 +153,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movslq %edi, %rax
-; SSE2-NEXT: movslq %esi, %rsi
-; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: andl $3, %edi
+; SSE2-NEXT: andl $3, %esi
+; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: andl $3, %ecx
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -145,11 +173,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
;
; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movslq %edi, %rax
-; SSSE3-NEXT: movslq %esi, %rsi
-; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: andl $3, %edi
+; SSSE3-NEXT: andl $3, %esi
+; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: andl $3, %ecx
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -161,11 +193,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
;
; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE41: # BB#0:
-; SSE41-NEXT: movslq %edi, %rax
-; SSE41-NEXT: movslq %esi, %rsi
-; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: andl $3, %edi
+; SSE41-NEXT: andl $3, %esi
+; SSE41-NEXT: andl $3, %edx
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: andl $3, %ecx
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, -24(%rsp,%rsi,4), %xmm0
; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
@@ -174,11 +210,15 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
;
; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
-; AVX-NEXT: movslq %esi, %rsi
-; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $3, %edi
+; AVX-NEXT: andl $3, %esi
+; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
@@ -204,34 +244,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE2-NEXT: movswq %di, %rax
-; SSE2-NEXT: movswq %si, %rsi
-; SSE2-NEXT: movswq %dx, %rdx
-; SSE2-NEXT: movswq %cx, %r10
-; SSE2-NEXT: movswq %r8w, %r11
+; SSE2-NEXT: andl $7, %edi
+; SSE2-NEXT: andl $7, %esi
+; SSE2-NEXT: andl $7, %edx
+; SSE2-NEXT: andl $7, %ecx
+; SSE2-NEXT: andl $7, %r8d
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movswq %r9w, %r8
-; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE2-NEXT: andl $7, %r9d
+; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: andl $7, %r10d
+; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $7, %eax
+; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %r10d
; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi
; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movd %r10d, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%r11,2), %eax
-; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %edx
+; SSE2-NEXT: movd %edx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movd %edi, %xmm1
+; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movd %esi, %xmm1
-; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -246,34 +288,36 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSSE3-NEXT: movswq %di, %rax
-; SSSE3-NEXT: movswq %si, %rsi
-; SSSE3-NEXT: movswq %dx, %rdx
-; SSSE3-NEXT: movswq %cx, %r10
-; SSSE3-NEXT: movswq %r8w, %r11
+; SSSE3-NEXT: andl $7, %edi
+; SSSE3-NEXT: andl $7, %esi
+; SSSE3-NEXT: andl $7, %edx
+; SSSE3-NEXT: andl $7, %ecx
+; SSSE3-NEXT: andl $7, %r8d
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movswq %r9w, %r8
-; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
-; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSSE3-NEXT: andl $7, %r9d
+; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: andl $7, %r10d
+; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $7, %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d
; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movd %r10d, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; SSSE3-NEXT: movd %edx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%r11,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %edx
+; SSSE3-NEXT: movd %edx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: movd %edi, %xmm1
+; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -282,68 +326,66 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSE41: # BB#0:
-; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE41-NEXT: movswq %di, %rax
-; SSE41-NEXT: movswq %si, %rbx
-; SSE41-NEXT: movswq %dx, %r11
-; SSE41-NEXT: movswq %cx, %r10
-; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: andl $7, %edi
+; SSE41-NEXT: andl $7, %esi
+; SSE41-NEXT: andl $7, %edx
+; SSE41-NEXT: andl $7, %ecx
+; SSE41-NEXT: andl $7, %r8d
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movswq %r9w, %rcx
-; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
-; SSE41-NEXT: movzwl -16(%rsp,%rdx,2), %edx
-; SSE41-NEXT: movzwl -16(%rsp,%rsi,2), %esi
-; SSE41-NEXT: movzwl -16(%rsp,%rax,2), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrw $1, -16(%rsp,%rbx,2), %xmm0
-; SSE41-NEXT: pinsrw $2, -16(%rsp,%r11,2), %xmm0
-; SSE41-NEXT: pinsrw $3, -16(%rsp,%r10,2), %xmm0
-; SSE41-NEXT: pinsrw $4, -16(%rsp,%rdi,2), %xmm0
-; SSE41-NEXT: pinsrw $5, -16(%rsp,%rcx,2), %xmm0
-; SSE41-NEXT: pinsrw $6, %edx, %xmm0
-; SSE41-NEXT: pinsrw $7, %esi, %xmm0
-; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: andl $7, %r9d
+; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE41-NEXT: andl $7, %r10d
+; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT: andl $7, %eax
+; SSE41-NEXT: movzwl -24(%rsp,%r10,2), %r10d
+; SSE41-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE41-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm0
+; SSE41-NEXT: pinsrw $2, -24(%rsp,%rdx,2), %xmm0
+; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm0
+; SSE41-NEXT: pinsrw $4, -24(%rsp,%r8,2), %xmm0
+; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0
+; SSE41-NEXT: pinsrw $6, %r10d, %xmm0
+; SSE41-NEXT: pinsrw $7, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; AVX: # BB#0:
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %rbx
; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX-NEXT: movswq %di, %r10
-; AVX-NEXT: movswq %si, %r11
-; AVX-NEXT: movswq %dx, %r14
-; AVX-NEXT: movswq %cx, %rcx
-; AVX-NEXT: movswq %r8w, %rdi
+; AVX-NEXT: andl $7, %edi
+; AVX-NEXT: andl $7, %esi
+; AVX-NEXT: andl $7, %edx
+; AVX-NEXT: andl $7, %ecx
+; AVX-NEXT: andl $7, %r8d
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movswq %r9w, %rax
-; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
-; AVX-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; AVX-NEXT: movzwl -24(%rsp,%rdx,2), %edx
-; AVX-NEXT: movzwl -24(%rsp,%r10,2), %ebx
-; AVX-NEXT: vmovd %ebx, %xmm0
-; AVX-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, -24(%rsp,%r14,2), %xmm0, %xmm0
+; AVX-NEXT: andl $7, %r9d
+; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; AVX-NEXT: andl $7, %r10d
+; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT: andl $7, %eax
+; AVX-NEXT: movzwl -24(%rsp,%r10,2), %r10d
+; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, -24(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r14
+; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <8 x i16> %x, i16 %i0
%x1 = extractelement <8 x i16> %x, i16 %i1
@@ -374,54 +416,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: andl $15, %r10d
; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
; SSE2-NEXT: movzbl (%r10,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm15
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm8
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm9
-; SSE2-NEXT: movsbq %dl, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %edx
+; SSE2-NEXT: movzbl (%rdx,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: movsbq %dil, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %edi
+; SSE2-NEXT: movzbl (%rdi,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm11
-; SSE2-NEXT: movsbq %r8b, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %r8d
+; SSE2-NEXT: movzbl (%r8,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm7
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm12
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm13
-; SSE2-NEXT: movsbq %cl, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %ecx
+; SSE2-NEXT: movzbl (%rcx,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm6
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm14
-; SSE2-NEXT: movsbq %sil, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %esi
+; SSE2-NEXT: movzbl (%rsi,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm5
-; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movzbl (%rax,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: movsbq %r9b, %rax
-; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: andl $15, %r9d
+; SSE2-NEXT: movzbl (%r9,%r11), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -449,54 +501,64 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSSE3-NEXT: andl $15, %r10d
; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
; SSSE3-NEXT: movzbl (%r10,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm15
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm8
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm9
-; SSSE3-NEXT: movsbq %dl, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %edx
+; SSSE3-NEXT: movzbl (%rdx,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm10
-; SSSE3-NEXT: movsbq %dil, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %edi
+; SSSE3-NEXT: movzbl (%rdi,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm11
-; SSSE3-NEXT: movsbq %r8b, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %r8d
+; SSSE3-NEXT: movzbl (%r8,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm7
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm12
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm13
-; SSSE3-NEXT: movsbq %cl, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %ecx
+; SSSE3-NEXT: movzbl (%rcx,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm6
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: movsbq %sil, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %esi
+; SSSE3-NEXT: movzbl (%rsi,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm5
-; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movzbl (%rax,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: movsbq %r9b, %rax
-; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: andl $15, %r9d
+; SSSE3-NEXT: movzbl (%r9,%r11), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -520,7 +582,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %r15
; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
@@ -529,54 +590,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE41-NEXT: movsbq %dil, %r15
-; SSE41-NEXT: movsbq %sil, %r14
-; SSE41-NEXT: movsbq %dl, %r11
-; SSE41-NEXT: movsbq %cl, %r10
-; SSE41-NEXT: movsbq %r8b, %r8
+; SSE41-NEXT: andl $15, %edi
+; SSE41-NEXT: andl $15, %esi
+; SSE41-NEXT: andl $15, %edx
+; SSE41-NEXT: andl $15, %ecx
+; SSE41-NEXT: andl $15, %r8d
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movsbq %r9b, %r9
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbx
+; SSE41-NEXT: andl $15, %r9d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE41-NEXT: andl $15, %r10d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; SSE41-NEXT: andl $15, %r11d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE41-NEXT: andl $15, %r14d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE41-NEXT: andl $15, %r15d
; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT: movzbl (%r15,%rax), %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
-; SSE41-NEXT: pinsrb $1, (%r14,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
-; SSE41-NEXT: pinsrb $2, (%r11,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
-; SSE41-NEXT: pinsrb $3, (%r10,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: movzbl (%rdi,%rax), %edi
+; SSE41-NEXT: movd %edi, %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; SSE41-NEXT: andl $15, %r12d
+; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; SSE41-NEXT: andl $15, %esi
+; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT: andl $15, %edx
+; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT: andl $15, %ecx
; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; SSE41-NEXT: andl $15, %ebx
; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0
-; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movzbl (%r12,%rax), %esi
-; SSE41-NEXT: movzbl (%r13,%rax), %edi
-; SSE41-NEXT: movzbl (%rbp,%rax), %ebp
-; SSE41-NEXT: movzbl (%rbx,%rax), %ebx
-; SSE41-NEXT: movzbl (%r15,%rax), %r8d
-; SSE41-NEXT: movzbl (%r14,%rax), %r9d
-; SSE41-NEXT: movzbl (%r11,%rax), %r11d
-; SSE41-NEXT: movzbl (%r10,%rax), %r10d
+; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSE41-NEXT: andl $15, %edi
+; SSE41-NEXT: movzbl (%r10,%rax), %r8d
+; SSE41-NEXT: movzbl (%r11,%rax), %r9d
+; SSE41-NEXT: movzbl (%r14,%rax), %r10d
+; SSE41-NEXT: movzbl (%r15,%rax), %r11d
+; SSE41-NEXT: movzbl (%r12,%rax), %ebp
+; SSE41-NEXT: movzbl (%rsi,%rax), %esi
+; SSE41-NEXT: movzbl (%rdx,%rax), %edx
; SSE41-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE41-NEXT: movzbl (%rdx,%rax), %eax
-; SSE41-NEXT: pinsrb $6, %esi, %xmm0
-; SSE41-NEXT: pinsrb $7, %edi, %xmm0
-; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
-; SSE41-NEXT: pinsrb $9, %ebx, %xmm0
-; SSE41-NEXT: pinsrb $10, %r8d, %xmm0
-; SSE41-NEXT: pinsrb $11, %r9d, %xmm0
-; SSE41-NEXT: pinsrb $12, %r11d, %xmm0
-; SSE41-NEXT: pinsrb $13, %r10d, %xmm0
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
+; SSE41-NEXT: movzbl (%rbx,%rax), %ebx
+; SSE41-NEXT: movzbl (%rdi,%rax), %eax
+; SSE41-NEXT: pinsrb $6, %r8d, %xmm0
+; SSE41-NEXT: pinsrb $7, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r10d, %xmm0
+; SSE41-NEXT: pinsrb $9, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $10, %ebp, %xmm0
+; SSE41-NEXT: pinsrb $11, %esi, %xmm0
+; SSE41-NEXT: pinsrb $12, %edx, %xmm0
+; SSE41-NEXT: pinsrb $13, %ecx, %xmm0
+; SSE41-NEXT: pinsrb $14, %ebx, %xmm0
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
; SSE41-NEXT: popq %r14
; SSE41-NEXT: popq %r15
; SSE41-NEXT: popq %rbp
@@ -587,7 +657,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r15
; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
@@ -596,54 +665,63 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX-NEXT: movsbq %dil, %r10
-; AVX-NEXT: movsbq %sil, %r11
-; AVX-NEXT: movsbq %dl, %r14
-; AVX-NEXT: movsbq %cl, %r15
-; AVX-NEXT: movsbq %r8b, %r8
+; AVX-NEXT: andl $15, %edi
+; AVX-NEXT: andl $15, %esi
+; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: andl $15, %r8d
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movsbq %r9b, %r9
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rdi
-; AVX-NEXT: movzbl (%r10,%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT: vpinsrb $1, (%r11,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
-; AVX-NEXT: vpinsrb $2, (%r14,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
-; AVX-NEXT: vpinsrb $3, (%r15,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
-; AVX-NEXT: vpinsrb $4, (%r8,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r8
-; AVX-NEXT: vpinsrb $5, (%r9,%rdi), %xmm0, %xmm0
-; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT: movzbl (%r12,%rdi), %edx
-; AVX-NEXT: movzbl (%r13,%rdi), %ebx
-; AVX-NEXT: movzbl (%rbp,%rdi), %ebp
-; AVX-NEXT: movzbl (%rcx,%rdi), %ecx
-; AVX-NEXT: movzbl (%r10,%rdi), %eax
-; AVX-NEXT: movzbl (%r11,%rdi), %r9d
-; AVX-NEXT: movzbl (%r14,%rdi), %r10d
-; AVX-NEXT: movzbl (%r15,%rdi), %r11d
-; AVX-NEXT: movzbl (%r8,%rdi), %r8d
-; AVX-NEXT: movzbl (%rsi,%rdi), %esi
-; AVX-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $7, %ebx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %esi, %xmm0, %xmm0
+; AVX-NEXT: andl $15, %r9d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX-NEXT: andl $15, %r10d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX-NEXT: andl $15, %r11d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX-NEXT: andl $15, %r14d
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX-NEXT: andl $15, %r15d
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movzbl (%rdi,%rax), %edi
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX-NEXT: andl $15, %r12d
+; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX-NEXT: andl $15, %esi
+; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX-NEXT: andl $15, %ebx
+; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0
+; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX-NEXT: andl $15, %edi
+; AVX-NEXT: movzbl (%r10,%rax), %r8d
+; AVX-NEXT: movzbl (%r11,%rax), %r9d
+; AVX-NEXT: movzbl (%r14,%rax), %r10d
+; AVX-NEXT: movzbl (%r15,%rax), %r11d
+; AVX-NEXT: movzbl (%r12,%rax), %ebp
+; AVX-NEXT: movzbl (%rsi,%rax), %esi
+; AVX-NEXT: movzbl (%rdx,%rax), %edx
+; AVX-NEXT: movzbl (%rcx,%rax), %ecx
+; AVX-NEXT: movzbl (%rbx,%rax), %ebx
+; AVX-NEXT: movzbl (%rdi,%rax), %eax
+; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
-; AVX-NEXT: popq %r13
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %r15
; AVX-NEXT: popq %rbp
@@ -690,11 +768,15 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind {
; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movslq (%rdi), %rax
+; SSE2-NEXT: movl (%rdi), %eax
+; SSE2-NEXT: movl 4(%rdi), %ecx
+; SSE2-NEXT: andl $3, %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movslq 4(%rdi), %rcx
-; SSE2-NEXT: movslq 8(%rdi), %rdx
-; SSE2-NEXT: movslq 12(%rdi), %rsi
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: movl 8(%rdi), %edx
+; SSE2-NEXT: andl $3, %edx
+; SSE2-NEXT: movl 12(%rdi), %esi
+; SSE2-NEXT: andl $3, %esi
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -706,11 +788,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
;
; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movslq (%rdi), %rax
+; SSSE3-NEXT: movl (%rdi), %eax
+; SSSE3-NEXT: movl 4(%rdi), %ecx
+; SSSE3-NEXT: andl $3, %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movslq 4(%rdi), %rcx
-; SSSE3-NEXT: movslq 8(%rdi), %rdx
-; SSSE3-NEXT: movslq 12(%rdi), %rsi
+; SSSE3-NEXT: andl $3, %ecx
+; SSSE3-NEXT: movl 8(%rdi), %edx
+; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: movl 12(%rdi), %esi
+; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -722,11 +808,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
;
; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; SSE41: # BB#0:
-; SSE41-NEXT: movslq (%rdi), %rax
+; SSE41-NEXT: movl (%rdi), %eax
+; SSE41-NEXT: movl 4(%rdi), %ecx
+; SSE41-NEXT: andl $3, %eax
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movslq 4(%rdi), %rcx
-; SSE41-NEXT: movslq 8(%rdi), %rdx
-; SSE41-NEXT: movslq 12(%rdi), %rsi
+; SSE41-NEXT: andl $3, %ecx
+; SSE41-NEXT: movl 8(%rdi), %edx
+; SSE41-NEXT: andl $3, %edx
+; SSE41-NEXT: movl 12(%rdi), %esi
+; SSE41-NEXT: andl $3, %esi
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrd $1, -24(%rsp,%rcx,4), %xmm0
; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
@@ -735,11 +825,15 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
;
; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq (%rdi), %rax
+; AVX-NEXT: movl (%rdi), %eax
+; AVX-NEXT: movl 4(%rdi), %ecx
+; AVX-NEXT: andl $3, %eax
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq 4(%rdi), %rcx
-; AVX-NEXT: movslq 8(%rdi), %rdx
-; AVX-NEXT: movslq 12(%rdi), %rsi
+; AVX-NEXT: andl $3, %ecx
+; AVX-NEXT: movl 8(%rdi), %edx
+; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: movl 12(%rdi), %esi
+; AVX-NEXT: andl $3, %esi
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
@@ -767,55 +861,71 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind {
; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movsbq (%rdi), %rcx
+; SSE2-NEXT: movzbl (%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movsbq 8(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm8
-; SSE2-NEXT: movsbq 12(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm9
-; SSE2-NEXT: movsbq 4(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
-; SSE2-NEXT: movsbq 14(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm10
-; SSE2-NEXT: movsbq 6(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm5
-; SSE2-NEXT: movsbq 10(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm11
-; SSE2-NEXT: movsbq 2(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm7
-; SSE2-NEXT: movsbq 15(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm12
-; SSE2-NEXT: movsbq 7(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: movsbq 11(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm13
-; SSE2-NEXT: movsbq 3(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm6
-; SSE2-NEXT: movsbq 13(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm14
-; SSE2-NEXT: movsbq 5(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm4
-; SSE2-NEXT: movsbq 9(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
-; SSE2-NEXT: movd %ecx, %xmm15
-; SSE2-NEXT: movsbq 1(%rdi), %rcx
-; SSE2-NEXT: movzbl (%rcx,%rax), %eax
+; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl 8(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: movzbl 12(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: movzbl 4(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movzbl 14(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movzbl 6(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movzbl 10(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: movzbl 2(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: movzbl 15(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: movzbl 7(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movzbl 11(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: movzbl 3(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movzbl 13(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movzbl 5(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movzbl 9(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
+; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: movzbl 1(%rdi), %eax
+; SSE2-NEXT: andl $15, %eax
+; SSE2-NEXT: movzbl (%rax,%rcx), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -836,55 +946,71 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
;
; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movsbq (%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movsbq 8(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm8
-; SSSE3-NEXT: movsbq 12(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm9
-; SSSE3-NEXT: movsbq 4(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movsbq 14(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm10
-; SSSE3-NEXT: movsbq 6(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm5
-; SSSE3-NEXT: movsbq 10(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm11
-; SSSE3-NEXT: movsbq 2(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm7
-; SSSE3-NEXT: movsbq 15(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm12
-; SSSE3-NEXT: movsbq 7(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movsbq 11(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm13
-; SSSE3-NEXT: movsbq 3(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm6
-; SSSE3-NEXT: movsbq 13(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm14
-; SSSE3-NEXT: movsbq 5(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movsbq 9(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm15
-; SSSE3-NEXT: movsbq 1(%rdi), %rcx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %eax
+; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzbl 8(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm8
+; SSSE3-NEXT: movzbl 12(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: movzbl 4(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: movzbl 14(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm10
+; SSSE3-NEXT: movzbl 6(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: movzbl 10(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm11
+; SSSE3-NEXT: movzbl 2(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm7
+; SSSE3-NEXT: movzbl 15(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: movzbl 7(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movzbl 11(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm13
+; SSSE3-NEXT: movzbl 3(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm6
+; SSSE3-NEXT: movzbl 13(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movzbl 5(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movzbl 9(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movzbl 1(%rdi), %eax
+; SSSE3-NEXT: andl $15, %eax
+; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -911,55 +1037,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movsbq (%rdi), %rax
+; SSE41-NEXT: movzbl (%rdi), %r11d
+; SSE41-NEXT: andl $15, %r11d
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movsbq 1(%rdi), %r15
-; SSE41-NEXT: movsbq 2(%rdi), %r8
-; SSE41-NEXT: movsbq 3(%rdi), %r9
-; SSE41-NEXT: movsbq 4(%rdi), %r10
-; SSE41-NEXT: movsbq 5(%rdi), %r11
-; SSE41-NEXT: movsbq 6(%rdi), %r14
-; SSE41-NEXT: movsbq 7(%rdi), %r12
-; SSE41-NEXT: movsbq 8(%rdi), %r13
-; SSE41-NEXT: movsbq 9(%rdi), %rdx
-; SSE41-NEXT: movsbq 10(%rdi), %rcx
-; SSE41-NEXT: movsbq 11(%rdi), %rsi
-; SSE41-NEXT: movsbq 12(%rdi), %rbx
+; SSE41-NEXT: movzbl 1(%rdi), %r9d
+; SSE41-NEXT: andl $15, %r9d
+; SSE41-NEXT: movzbl 2(%rdi), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE41-NEXT: movzbl 3(%rdi), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE41-NEXT: movzbl 4(%rdi), %r14d
+; SSE41-NEXT: andl $15, %r14d
+; SSE41-NEXT: movzbl 5(%rdi), %r15d
+; SSE41-NEXT: andl $15, %r15d
+; SSE41-NEXT: movzbl 6(%rdi), %r12d
+; SSE41-NEXT: andl $15, %r12d
+; SSE41-NEXT: movzbl 7(%rdi), %r13d
+; SSE41-NEXT: andl $15, %r13d
+; SSE41-NEXT: movzbl 8(%rdi), %r8d
+; SSE41-NEXT: andl $15, %r8d
+; SSE41-NEXT: movzbl 9(%rdi), %eax
+; SSE41-NEXT: andl $15, %eax
+; SSE41-NEXT: movzbl 10(%rdi), %ecx
+; SSE41-NEXT: andl $15, %ecx
+; SSE41-NEXT: movzbl 11(%rdi), %edx
+; SSE41-NEXT: andl $15, %edx
+; SSE41-NEXT: movzbl 12(%rdi), %esi
+; SSE41-NEXT: andl $15, %esi
; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movzbl (%rax,%rbp), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: movsbq 13(%rdi), %rax
-; SSE41-NEXT: pinsrb $1, (%r15,%rbp), %xmm0
-; SSE41-NEXT: movsbq 14(%rdi), %r15
-; SSE41-NEXT: movsbq 15(%rdi), %rdi
-; SSE41-NEXT: movzbl (%rdi,%rbp), %edi
-; SSE41-NEXT: movzbl (%r15,%rbp), %r15d
-; SSE41-NEXT: movzbl (%rax,%rbp), %eax
-; SSE41-NEXT: movzbl (%rbx,%rbp), %ebx
+; SSE41-NEXT: movzbl (%r11,%rbp), %ebx
+; SSE41-NEXT: movd %ebx, %xmm0
+; SSE41-NEXT: movzbl 13(%rdi), %r11d
+; SSE41-NEXT: andl $15, %r11d
+; SSE41-NEXT: pinsrb $1, (%r9,%rbp), %xmm0
+; SSE41-NEXT: movzbl 14(%rdi), %ebx
+; SSE41-NEXT: andl $15, %ebx
+; SSE41-NEXT: movzbl 15(%rdi), %edi
+; SSE41-NEXT: andl $15, %edi
+; SSE41-NEXT: movzbl (%rdi,%rbp), %r10d
+; SSE41-NEXT: movzbl (%rbx,%rbp), %r9d
+; SSE41-NEXT: movzbl (%r11,%rbp), %r11d
; SSE41-NEXT: movzbl (%rsi,%rbp), %esi
-; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx
; SSE41-NEXT: movzbl (%rdx,%rbp), %edx
+; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx
+; SSE41-NEXT: movzbl (%rax,%rbp), %eax
+; SSE41-NEXT: movzbl (%r8,%rbp), %r8d
; SSE41-NEXT: movzbl (%r13,%rbp), %r13d
; SSE41-NEXT: movzbl (%r12,%rbp), %r12d
+; SSE41-NEXT: movzbl (%r15,%rbp), %r15d
; SSE41-NEXT: movzbl (%r14,%rbp), %r14d
-; SSE41-NEXT: movzbl (%r11,%rbp), %r11d
-; SSE41-NEXT: movzbl (%r10,%rbp), %r10d
-; SSE41-NEXT: movzbl (%r9,%rbp), %r9d
-; SSE41-NEXT: movzbl (%r8,%rbp), %ebp
+; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; SSE41-NEXT: movzbl (%rdi,%rbp), %edi
+; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; SSE41-NEXT: movzbl (%rbx,%rbp), %ebp
; SSE41-NEXT: pinsrb $2, %ebp, %xmm0
-; SSE41-NEXT: pinsrb $3, %r9d, %xmm0
-; SSE41-NEXT: pinsrb $4, %r10d, %xmm0
-; SSE41-NEXT: pinsrb $5, %r11d, %xmm0
-; SSE41-NEXT: pinsrb $6, %r14d, %xmm0
-; SSE41-NEXT: pinsrb $7, %r12d, %xmm0
-; SSE41-NEXT: pinsrb $8, %r13d, %xmm0
-; SSE41-NEXT: pinsrb $9, %edx, %xmm0
+; SSE41-NEXT: pinsrb $3, %edi, %xmm0
+; SSE41-NEXT: pinsrb $4, %r14d, %xmm0
+; SSE41-NEXT: pinsrb $5, %r15d, %xmm0
+; SSE41-NEXT: pinsrb $6, %r12d, %xmm0
+; SSE41-NEXT: pinsrb $7, %r13d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r8d, %xmm0
+; SSE41-NEXT: pinsrb $9, %eax, %xmm0
; SSE41-NEXT: pinsrb $10, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $11, %esi, %xmm0
-; SSE41-NEXT: pinsrb $12, %ebx, %xmm0
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: pinsrb $14, %r15d, %xmm0
-; SSE41-NEXT: pinsrb $15, %edi, %xmm0
+; SSE41-NEXT: pinsrb $11, %edx, %xmm0
+; SSE41-NEXT: pinsrb $12, %esi, %xmm0
+; SSE41-NEXT: pinsrb $13, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $14, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $15, %r10d, %xmm0
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
@@ -976,55 +1122,75 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movsbq (%rdi), %rsi
+; AVX-NEXT: movzbl (%rdi), %r11d
+; AVX-NEXT: andl $15, %r11d
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movsbq 1(%rdi), %r15
-; AVX-NEXT: movsbq 2(%rdi), %r8
-; AVX-NEXT: movsbq 3(%rdi), %r9
-; AVX-NEXT: movsbq 4(%rdi), %r10
-; AVX-NEXT: movsbq 5(%rdi), %r11
-; AVX-NEXT: movsbq 6(%rdi), %r14
-; AVX-NEXT: movsbq 7(%rdi), %r12
-; AVX-NEXT: movsbq 8(%rdi), %r13
-; AVX-NEXT: movsbq 9(%rdi), %rdx
-; AVX-NEXT: movsbq 10(%rdi), %rax
-; AVX-NEXT: movsbq 11(%rdi), %rcx
-; AVX-NEXT: movsbq 12(%rdi), %rbx
+; AVX-NEXT: movzbl 1(%rdi), %r9d
+; AVX-NEXT: andl $15, %r9d
+; AVX-NEXT: movzbl 2(%rdi), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX-NEXT: movzbl 3(%rdi), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX-NEXT: movzbl 4(%rdi), %r14d
+; AVX-NEXT: andl $15, %r14d
+; AVX-NEXT: movzbl 5(%rdi), %r15d
+; AVX-NEXT: andl $15, %r15d
+; AVX-NEXT: movzbl 6(%rdi), %r12d
+; AVX-NEXT: andl $15, %r12d
+; AVX-NEXT: movzbl 7(%rdi), %r13d
+; AVX-NEXT: andl $15, %r13d
+; AVX-NEXT: movzbl 8(%rdi), %r8d
+; AVX-NEXT: andl $15, %r8d
+; AVX-NEXT: movzbl 9(%rdi), %eax
+; AVX-NEXT: andl $15, %eax
+; AVX-NEXT: movzbl 10(%rdi), %ecx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: movzbl 11(%rdi), %edx
+; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: movzbl 12(%rdi), %esi
+; AVX-NEXT: andl $15, %esi
; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movzbl (%r11,%rbp), %ebx
+; AVX-NEXT: vmovd %ebx, %xmm0
+; AVX-NEXT: movzbl 13(%rdi), %r11d
+; AVX-NEXT: andl $15, %r11d
+; AVX-NEXT: vpinsrb $1, (%r9,%rbp), %xmm0, %xmm0
+; AVX-NEXT: movzbl 14(%rdi), %ebx
+; AVX-NEXT: andl $15, %ebx
+; AVX-NEXT: movzbl 15(%rdi), %edi
+; AVX-NEXT: andl $15, %edi
+; AVX-NEXT: movzbl (%rdi,%rbp), %r10d
+; AVX-NEXT: movzbl (%rbx,%rbp), %r9d
+; AVX-NEXT: movzbl (%r11,%rbp), %r11d
; AVX-NEXT: movzbl (%rsi,%rbp), %esi
-; AVX-NEXT: vmovd %esi, %xmm0
-; AVX-NEXT: movsbq 13(%rdi), %rsi
-; AVX-NEXT: vpinsrb $1, (%r15,%rbp), %xmm0, %xmm0
-; AVX-NEXT: movsbq 14(%rdi), %r15
-; AVX-NEXT: movsbq 15(%rdi), %rdi
-; AVX-NEXT: movzbl (%rdi,%rbp), %edi
-; AVX-NEXT: movzbl (%r15,%rbp), %r15d
-; AVX-NEXT: movzbl (%rsi,%rbp), %esi
-; AVX-NEXT: movzbl (%rbx,%rbp), %ebx
+; AVX-NEXT: movzbl (%rdx,%rbp), %edx
; AVX-NEXT: movzbl (%rcx,%rbp), %ecx
; AVX-NEXT: movzbl (%rax,%rbp), %eax
-; AVX-NEXT: movzbl (%rdx,%rbp), %edx
+; AVX-NEXT: movzbl (%r8,%rbp), %r8d
; AVX-NEXT: movzbl (%r13,%rbp), %r13d
; AVX-NEXT: movzbl (%r12,%rbp), %r12d
+; AVX-NEXT: movzbl (%r15,%rbp), %r15d
; AVX-NEXT: movzbl (%r14,%rbp), %r14d
-; AVX-NEXT: movzbl (%r11,%rbp), %r11d
-; AVX-NEXT: movzbl (%r10,%rbp), %r10d
-; AVX-NEXT: movzbl (%r9,%rbp), %r9d
-; AVX-NEXT: movzbl (%r8,%rbp), %ebp
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; AVX-NEXT: movzbl (%rdi,%rbp), %edi
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; AVX-NEXT: movzbl (%rbx,%rbp), %ebp
; AVX-NEXT: vpinsrb $2, %ebp, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $3, %r9d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, %ebx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, %r15d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %r10d, %xmm0, %xmm0
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r13
@@ -1106,11 +1272,14 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; SSE: # BB#0:
-; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movslq %edx, %rdx
+; SSE-NEXT: andl $3, %edx
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movslq %ecx, %rcx
+; SSE-NEXT: andl $3, %ecx
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -1120,11 +1289,14 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float>
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; AVX: # BB#0:
-; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: andl $3, %edx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: andl $3, %ecx
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -1151,31 +1323,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE2-NEXT: movswq %di, %r10
-; SSE2-NEXT: movswq %si, %rsi
-; SSE2-NEXT: movswq %dx, %r11
-; SSE2-NEXT: movswq %cx, %rcx
+; SSE2-NEXT: andl $7, %edi
+; SSE2-NEXT: andl $7, %esi
+; SSE2-NEXT: andl $7, %edx
+; SSE2-NEXT: andl $7, %ecx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movswq %r8w, %rdi
+; SSE2-NEXT: andl $7, %r8d
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movswq %r9w, %rax
-; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSE2-NEXT: xorl %edx, %edx
-; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: andl $7, %r9d
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %esi, %xmm2
-; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movzwl -40(%rsp,%r10,2), %eax
-; SSE2-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSE2-NEXT: movzwl -40(%rsp,%rdx,2), %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1190,31 +1362,31 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSSE3-NEXT: movswq %di, %r10
-; SSSE3-NEXT: movswq %si, %rsi
-; SSSE3-NEXT: movswq %dx, %r11
-; SSSE3-NEXT: movswq %cx, %rcx
+; SSSE3-NEXT: andl $7, %edi
+; SSSE3-NEXT: andl $7, %esi
+; SSSE3-NEXT: andl $7, %edx
+; SSSE3-NEXT: andl $7, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movswq %r8w, %rdi
+; SSSE3-NEXT: andl $7, %r8d
; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movswq %r9w, %rax
-; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
-; SSSE3-NEXT: xorl %edx, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
+; SSSE3-NEXT: andl $7, %r9d
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT: xorl %esi, %esi
+; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movd %esi, %xmm2
-; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: movzwl -40(%rsp,%r10,2), %eax
-; SSSE3-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSSE3-NEXT: movzwl -40(%rsp,%rdx,2), %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1229,21 +1401,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; SSE41-NEXT: movswq %di, %rax
-; SSE41-NEXT: movswq %si, %rsi
-; SSE41-NEXT: movswq %dx, %rdx
-; SSE41-NEXT: movswq %cx, %r10
+; SSE41-NEXT: andl $7, %edi
+; SSE41-NEXT: andl $7, %esi
+; SSE41-NEXT: andl $7, %edx
+; SSE41-NEXT: andl $7, %ecx
; SSE41-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: andl $7, %r8d
; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movswq %r9w, %rcx
-; SSE41-NEXT: movzwl -40(%rsp,%rax,2), %eax
+; SSE41-NEXT: andl $7, %r9d
+; SSE41-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSE41-NEXT: movd %eax, %xmm1
; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm1
; SSE41-NEXT: pinsrw $2, -40(%rsp,%rdx,2), %xmm1
-; SSE41-NEXT: pinsrw $3, -24(%rsp,%r10,2), %xmm1
-; SSE41-NEXT: pinsrw $4, -40(%rsp,%rdi,2), %xmm1
-; SSE41-NEXT: pinsrw $5, -24(%rsp,%rcx,2), %xmm1
+; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm1
+; SSE41-NEXT: pinsrw $4, -40(%rsp,%r8,2), %xmm1
+; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
@@ -1256,21 +1428,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX1-NEXT: movswq %di, %r10
-; AVX1-NEXT: movswq %si, %r11
-; AVX1-NEXT: movswq %dx, %rdx
-; AVX1-NEXT: movswq %cx, %rcx
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: andl $7, %ecx
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movswq %r8w, %rdi
+; AVX1-NEXT: andl $7, %r8d
; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movswq %r9w, %rax
-; AVX1-NEXT: movzwl -40(%rsp,%r10,2), %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-NEXT: retq
@@ -1283,21 +1455,21 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX2-NEXT: movswq %di, %r10
-; AVX2-NEXT: movswq %si, %r11
-; AVX2-NEXT: movswq %dx, %rdx
-; AVX2-NEXT: movswq %cx, %rcx
+; AVX2-NEXT: andl $7, %edi
+; AVX2-NEXT: andl $7, %esi
+; AVX2-NEXT: andl $7, %edx
+; AVX2-NEXT: andl $7, %ecx
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movswq %r8w, %rdi
+; AVX2-NEXT: andl $7, %r8d
; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movswq %r9w, %rax
-; AVX2-NEXT: movzwl -40(%rsp,%r10,2), %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $7, %r9d
+; AVX2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll
index b43ec058ed918..42b3c11d3d6bf 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -13,6 +13,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0,
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: andl $3, %ecx
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: andl $3, %esi
+; ALL-NEXT: andl $3, %edi
; ALL-NEXT: vmovaps %ymm0, (%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -40,6 +44,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: andl $3, %esi
; ALL-NEXT: vmovaps %ymm0, (%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
@@ -62,6 +68,10 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
; ALL: # BB#0:
+; ALL-NEXT: andl $1, %ecx
+; ALL-NEXT: andl $1, %edx
+; ALL-NEXT: andl $1, %esi
+; ALL-NEXT: andl $1, %edi
; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -87,6 +97,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: andl $3, %edx
+; AVX1-NEXT: andl $3, %esi
+; AVX1-NEXT: andl $3, %edi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -105,6 +119,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: andl $3, %edi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -134,6 +152,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: andl $3, %esi
+; AVX1-NEXT: andl $3, %edi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -150,6 +170,8 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: andl $3, %edi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -173,6 +195,10 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1: # BB#0:
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: andl $1, %edx
+; AVX1-NEXT: andl $1, %esi
+; AVX1-NEXT: andl $1, %edi
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -185,6 +211,10 @@ define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i
;
; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2: # BB#0:
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: andl $1, %edx
+; AVX2-NEXT: andl $1, %esi
+; AVX2-NEXT: andl $1, %edi
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -212,15 +242,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: movslq %edi, %rax
-; AVX1-NEXT: movslq %esi, %rsi
-; AVX1-NEXT: movslq %edx, %rdx
-; AVX1-NEXT: movslq %ecx, %r11
-; AVX1-NEXT: movslq %r8d, %r10
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: andl $7, %ecx
+; AVX1-NEXT: andl $7, %r8d
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: movslq %r9d, %r8
-; AVX1-NEXT: movslq 16(%rbp), %rdi
-; AVX1-NEXT: movslq 24(%rbp), %rcx
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: movl 16(%rbp), %r10d
+; AVX1-NEXT: andl $7, %r10d
+; AVX1-NEXT: movl 24(%rbp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -284,15 +322,23 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0
define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
; ALL: # BB#0:
-; ALL-NEXT: movslq %edi, %rax
-; ALL-NEXT: movslq %esi, %rsi
-; ALL-NEXT: movslq %edx, %rdx
-; ALL-NEXT: movslq %ecx, %r11
-; ALL-NEXT: movslq %r8d, %r10
+; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; ALL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT: andl $3, %edi
+; ALL-NEXT: andl $3, %esi
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: andl $3, %ecx
+; ALL-NEXT: andl $3, %r8d
; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: movslq %r9d, %r8
-; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rdi
-; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rcx
+; ALL-NEXT: andl $3, %r9d
+; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; ALL-NEXT: andl $3, %r10d
+; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; ALL-NEXT: andl $3, %eax
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -331,48 +377,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: movslq 32(%rbp), %rax
+; AVX1-NEXT: movl 32(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: movslq 40(%rbp), %rax
+; AVX1-NEXT: movl 40(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 48(%rbp), %rax
+; AVX1-NEXT: movl 48(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 56(%rbp), %rax
+; AVX1-NEXT: movl 56(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 64(%rbp), %rax
+; AVX1-NEXT: movl 64(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 72(%rbp), %rax
+; AVX1-NEXT: movl 72(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 80(%rbp), %rax
+; AVX1-NEXT: movl 80(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq 88(%rbp), %rax
+; AVX1-NEXT: movl 88(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq %edi, %rax
-; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: andl $15, %edi
+; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movslq %esi, %rax
-; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %edx, %rax
-; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %ecx, %rax
-; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r8d, %rax
-; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r9d, %rax
-; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq 16(%rbp), %rax
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %r8d
+; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $15, %r9d
+; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
+; AVX1-NEXT: movl 16(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movslq 24(%rbp), %rax
+; AVX1-NEXT: movl 24(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -386,48 +448,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; AVX2-NEXT: movslq 32(%rbp), %rax
+; AVX2-NEXT: movl 32(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: movslq 40(%rbp), %rax
+; AVX2-NEXT: movl 40(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 48(%rbp), %rax
+; AVX2-NEXT: movl 48(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 56(%rbp), %rax
+; AVX2-NEXT: movl 56(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 64(%rbp), %rax
+; AVX2-NEXT: movl 64(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 72(%rbp), %rax
+; AVX2-NEXT: movl 72(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 80(%rbp), %rax
+; AVX2-NEXT: movl 80(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq 88(%rbp), %rax
+; AVX2-NEXT: movl 88(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq %edi, %rax
-; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: andl $15, %edi
+; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: movslq %esi, %rax
-; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %edx, %rax
-; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %ecx, %rax
-; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r8d, %rax
-; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r9d, %rax
-; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq 16(%rbp), %rax
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %r8d
+; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $15, %r9d
+; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
+; AVX2-NEXT: movl 16(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movslq 24(%rbp), %rax
+; AVX2-NEXT: movl 24(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -472,48 +550,64 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX1: # BB#0:
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movslq %edi, %rax
-; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movslq %esi, %rax
-; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %edx, %rax
-; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %ecx, %rax
-; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r8d, %rax
-; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq %r9d, %rax
-; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %ecx
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %r8d
+; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -521,48 +615,64 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX2: # BB#0:
+; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movslq %edi, %rax
-; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: andl $7, %edi
+; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: movslq %esi, %rax
-; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %edx, %rax
-; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %ecx, %rax
-; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r8d, %rax
-; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq %r9d, %rax
-; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: andl $7, %esi
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %edx
+; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %ecx
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %r8d
+; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
+; AVX2-NEXT: andl $7, %r9d
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -615,8 +725,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: andl $3, %eax
+; AVX1-NEXT: andl $3, %ecx
; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: andl $3, %edx
; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: andl $3, %esi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -637,8 +751,12 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: andl $3, %edx
; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: andl $3, %esi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -674,8 +792,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi
; AVX1: # BB#0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: andl $1, %esi
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
@@ -690,8 +812,12 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi
; AVX2: # BB#0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: andl $1, %esi
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
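The recurring change across all of these shuffle hunks is the same lowering
tweak: a sign-extending index move (movslq/movswq) becomes an andl mask,
since a variable shuffle index only needs its low bits once the vector has
been spilled to the stack. A minimal sketch of the before/after shapes
(illustrative registers and offsets, not copied from any one test above):

    movswq  %di, %rax                 # old: sign-extend the 16-bit index
    movzwl  -24(%rsp,%rax,2), %eax    #      then load the selected element

    andl    $7, %edi                  # new: mask to 0..7 for an 8-element vector;
    movzwl  -24(%rsp,%rdi,2), %eax    #      the 32-bit op zero-extends into %rdi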
diff --git a/test/CodeGen/X86/x86-64-double-shifts-var.ll b/test/CodeGen/X86/x86-64-double-shifts-var.ll
index 8d2dbbdb5d245..c025ee874b2d5 100644
--- a/test/CodeGen/X86/x86-64-double-shifts-var.ll
+++ b/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -17,6 +17,7 @@
; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver3 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=bdver4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=znver1 | FileCheck %s
; Verify that for the X86_64 processors that are known to have poor latency
; double precision shift instructions we do not generate 'shld' or 'shrd'
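For context, the IR shape these RUN lines exercise is a variable 64-bit
double shift: two shifts combined with an or, which x86 may lower to shld
or shrd. A hypothetical sketch of that pattern (assumed, not copied from
the test body):

    define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind {
    entry:
      %conv = zext i32 %c to i64
      %shl = shl i64 %a, %conv          ; high half, shifted left by c
      %sub = sub nsw i32 64, %c
      %conv1 = zext i32 %sub to i64
      %shr = lshr i64 %b, %conv1        ; low half fills the vacated bits
      %or = or i64 %shr, %shl           ; the shld-style combined result
      ret i64 %or
    }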
diff --git a/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll b/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll
index a62def35acc53..2185fbb845e5f 100644
--- a/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll
+++ b/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll
@@ -48,6 +48,43 @@ if.end: ; preds = %if.else, %if.then
ret i32 %b.addr.0, !dbg !14
}
+; When the commoned instructions have the same debug location, this location
+; should be used as the location of the common instruction.
+
+; Generated from source (with -mllvm -no-discriminators and -gno-column-info):
+
+; int test2(int a, int b) {
+; if(a) b -= foo(); else b -= bar();
+; return b;
+; }
+
+; CHECK: define i32 @test2
+; CHECK-LABEL: if.end:
+; CHECK: %[[PHI:.*]] = phi i32 [ %call1, %if.else ], [ %call, %if.then ]
+; CHECK: sub nsw i32 %b, %[[PHI]], !dbg ![[DBG:.*]]
+; CHECK: ret i32
+; CHECK: ![[DBG]] = !DILocation(line: 17, scope: !{{.*}})
+
+define i32 @test2(i32 %a, i32 %b) !dbg !15 {
+entry:
+ %tobool = icmp ne i32 %a, 0, !dbg !16
+ br i1 %tobool, label %if.then, label %if.else, !dbg !16
+
+if.then: ; preds = %entry
+ %call = call i32 @foo(), !dbg !16
+ %sub = sub nsw i32 %b, %call, !dbg !16
+ br label %if.end, !dbg !16
+
+if.else: ; preds = %entry
+ %call1 = call i32 @bar(), !dbg !16
+ %sub2 = sub nsw i32 %b, %call1, !dbg !16
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %b.addr.0 = phi i32 [ %sub, %if.then ], [ %sub2, %if.else ]
+ ret i32 %b.addr.0, !dbg !17
+}
+
declare i32 @foo()
declare i32 @bar()
@@ -68,3 +105,6 @@ declare i32 @bar()
!12 = !DILocation(line: 12, column: 10, scope: !6)
!13 = !DILocation(line: 12, column: 7, scope: !6)
!14 = !DILocation(line: 13, column: 3, scope: !6)
+!15 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 16, type: !7, isLocal: false, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!16 = !DILocation(line: 17, scope: !15)
+!17 = !DILocation(line: 18, scope: !15)
diff --git a/test/DebugInfo/Inputs/implicit-const-test.o b/test/DebugInfo/Inputs/implicit-const-test.o
new file mode 100644
index 0000000000000..eef5cc930313a
--- /dev/null
+++ b/test/DebugInfo/Inputs/implicit-const-test.o
Binary files differ
diff --git a/test/DebugInfo/dwarfdump-implicit-const.test b/test/DebugInfo/dwarfdump-implicit-const.test
new file mode 100644
index 0000000000000..5458ada1a17a3
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-implicit-const.test
@@ -0,0 +1,2 @@
+RUN: llvm-dwarfdump -debug-dump=abbrev %p/Inputs/implicit-const-test.o | FileCheck %s
+CHECK: DW_FORM_implicit_const -9223372036854775808
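Worked out, the checked constant is -(2^63) = -9223372036854775808, i.e.
INT64_MIN — presumably chosen so the test exercises full-width signed
decoding of the implicit constant stored in the abbreviation table.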
diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s
index a9ec009395043..7ef07d72200d3 100644
--- a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s
@@ -33,6 +33,9 @@ k:
r:
# R_AARCH64_PREL32: use Q instead of f to fit in 32 bits.
.word Q - .
+# R_AARCH64_PREL64
+ .p2align 3
+ .xword f - .
# LE instructions read as BE
# rtdyld-check: *{4}(g) = 0x6024e0d2
@@ -41,3 +44,4 @@ r:
# rtdyld-check: *{4}(g + 12) = 0xe0bd99f2
# rtdyld-check: *{8}k = f
# rtdyld-check: *{4}r = (Q - r)[31:0]
+# rtdyld-check: *{8}(r + 8) = f - r - 8
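The new check is the PREL64 relocation formula S + A - P made concrete:
after .p2align 3, the .xword f - . sits at address r + 8, so the value
stored there is f - (r + 8) = f - r - 8.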
diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s
index f9a03ab40667c..069170bdf36bd 100644
--- a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s
@@ -47,7 +47,10 @@ k:
.size k, 16
r:
# R_AARCH64_PREL32: use Q instead of f to fit in 32 bits.
- .word Q - .
+ .word Q - .
+# R_AARCH64_PREL64
+ .p2align 3
+ .xword f - .
# rtdyld-check: *{4}(g) = 0xd2e02460
# rtdyld-check: *{4}(g + 4) = 0xf2c8ace0
@@ -65,6 +68,7 @@ r:
# rtdyld-check: *{8}k = f
# rtdyld-check: *{4}r = (Q - r)[31:0]
+# rtdyld-check: *{8}(r + 8) = f - r - 8
## f & 0xFFF = 0xdef (bits 11:0 of f)
## 0xdef << 10 = 0x37bc00
diff --git a/test/FileCheck/match-full-lines.txt b/test/FileCheck/match-full-lines.txt
new file mode 100644
index 0000000000000..d6b10a5e3a9b9
--- /dev/null
+++ b/test/FileCheck/match-full-lines.txt
@@ -0,0 +1,53 @@
+// RUN: not FileCheck -match-full-lines -input-file %s %s 2>&1 \
+// RUN: | FileCheck --check-prefix=ERROR --implicit-check-not=error: %s
+// RUN: not FileCheck -match-full-lines -strict-whitespace -input-file %s %s 2>&1 \
+// RUN: | FileCheck --check-prefix=ERROR-STRICT --check-prefix=ERROR --implicit-check-not=error: %s
+
+Label 1
+a line
+trailing whitespace
+trailing more whitespace
+Label 2
+a line
+ leading whitespace
+ leading more whitespace
+
+Label 3
+a line
+
+Label 4
+a line
+a random thing
+
+Label 5
+Label 66
+
+// CHECK-LABEL:Label 1
+// CHECK:a line
+// CHECK:trailing whitespace
+// CHECK:trailing more whitespace
+// ERROR-STRICT:error: expected string not found in input
+// ERROR-STRICT:// {{C}}HECK:trailing whitespace
+
+// CHECK-LABEL:Label 2
+// CHECK:a line
+// CHECK-NEXT:leading whitespace
+// CHECK-NEXT: leading more whitespace
+// ERROR-STRICT:error: expected string not found in input
+// ERROR-STRICT:// {{C}}HECK-NEXT:leading whitespace
+
+// CHECK-LABEL:Label 3
+// CHECK:line
+// ERROR:error: expected string not found in input
+// ERROR:// {{C}}HECK:line
+
+// CHECK-LABEL:Label 4
+// CHECK:a line
+// CHECK-NOT:random
+// ERROR:error: {{C}}HECK-NOT: string occurred!
+// ERROR:a random thing
+
+// CHECK-LABEL:Label 5
+// CHECK-LABEL:Label 6
+// ERROR:error: expected string not found in input
+// ERROR:{{C}}HECK-LABEL:Label 6
diff --git a/test/FileCheck/strict-whitespace-match-full-lines.txt b/test/FileCheck/strict-whitespace-match-full-lines.txt
deleted file mode 100644
index b0b6135b05d30..0000000000000
--- a/test/FileCheck/strict-whitespace-match-full-lines.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: sed 's/^;.*$//' %s \
-; RUN: | FileCheck --strict-whitespace --match-full-lines %s
-
-bla1
-bla2
- bla3
-bla4
- bla5
-
-; CHECK-LABEL:bla1
-; CHECK-NEXT:bla2
-; CHECK-NEXT: bla3
-; CHECK-NEXT:bla4
-; CHECK-NEXT: bla5
diff --git a/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll b/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll
index af4da14d786f7..5d510014a12a9 100644
--- a/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll
+++ b/test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll
@@ -16,7 +16,7 @@ target triple = "x86_64-apple-macosx10.11.0"
; Find the metadata for @global:
-; CHECK: [[METADATA:@.+]] = internal global {{.*}} @global {{.*}} section "__DATA,__asan_globals,regular", align 1
+; CHECK: [[METADATA:@.+]] = internal global {{.*}} @global {{.*}} section "__DATA,__asan_globals,regular", align 64
; Find the liveness binder for @global and its metadata:
; CHECK: @__asan_binder_global = internal global {{.*}} @global {{.*}} [[METADATA]] {{.*}} section "__DATA,__asan_liveness,regular,live_support"
diff --git a/test/MC/AMDGPU/vop_dpp.s b/test/MC/AMDGPU/vop_dpp.s
index 52b5df3a44c8a..19f21c48ca217 100644
--- a/test/MC/AMDGPU/vop_dpp.s
+++ b/test/MC/AMDGPU/vop_dpp.s
@@ -1,7 +1,9 @@
-// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI
+
// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI
// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI
// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOVI
//===----------------------------------------------------------------------===//
// Check dpp_ctrl values
@@ -527,3 +529,31 @@ v_subb_u32 v1, vcc, v2, v3, vcc row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:
// NOSICI: error:
// VI: v_subbrev_u32_dpp v1, vcc, v2, v3, vcc row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x06,0x02,0x3c,0x02,0x01,0x09,0xa1]
v_subbrev_u32 v1, vcc, v2, v3, vcc row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
+
+//===----------------------------------------------------------------------===//
+// Check that immediates and scalar regs are not supported
+//===----------------------------------------------------------------------===//
+
+// NOSICI: error:
+// NOVI: error:
+v_mov_b32 v0, 1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
+
+// NOSICI: error:
+// NOVI: error:
+v_and_b32 v0, 42, v1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
+
+// NOSICI: error:
+// NOVI: error:
+v_add_f32 v0, v1, 345 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
+
+// NOSICI: error:
+// NOVI: error:
+v_mov_b32 v0, s1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
+
+// NOSICI: error:
+// NOVI: error:
+v_and_b32 v0, s42, v1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
+
+// NOSICI: error:
+// NOVI: error:
+v_add_f32 v0, v1, s45 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
diff --git a/test/MC/AMDGPU/vop_sdwa.s b/test/MC/AMDGPU/vop_sdwa.s
index f139e7c908ff9..75db3259f43c5 100644
--- a/test/MC/AMDGPU/vop_sdwa.s
+++ b/test/MC/AMDGPU/vop_sdwa.s
@@ -594,3 +594,39 @@ v_cmp_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
// NOSICI: error:
// VI: v_cmpx_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x02,0x04]
v_cmpx_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+
+//===----------------------------------------------------------------------===//
+// Check that immediates and scalar regs are not supported
+//===----------------------------------------------------------------------===//
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_mov_b32 v0, 1 src0_sel:BYTE_2 src1_sel:WORD_0
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_and_b32 v0, 42, v1 src0_sel:BYTE_2 src1_sel:WORD_0
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_add_f32 v0, v1, 345 src0_sel:BYTE_2 src1_sel:WORD_0
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_cmpx_class_f32 vcc, -1, 200 src0_sel:BYTE_2 src1_sel:WORD_0
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_mov_b32 v0, s1 src0_sel:BYTE_2 src1_sel:WORD_0
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_and_b32 v0, s42, v1 src0_sel:BYTE_2 src1_sel:WORD_0
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_add_f32 v0, v1, s45 src0_sel:BYTE_2 src1_sel:WORD_0
+
+// NOSICI: error:
+// NOVI: error: invalid operand for instruction
+v_cmpx_class_f32 vcc, s1, s2 src0_sel:BYTE_2 src1_sel:WORD_0
diff --git a/test/MC/ARM/directive-object_arch-2.s b/test/MC/ARM/directive-object_arch-2.s
index 3aca434a1e4c6..f0596f12385ca 100644
--- a/test/MC/ARM/directive-object_arch-2.s
+++ b/test/MC/ARM/directive-object_arch-2.s
@@ -14,7 +14,7 @@
@ CHECK: }
@ CHECK: Attribute {
@ CHECK: Tag: 6
-@ CHEKC: Value: 1
+@ CHECK: Value: 1
@ CHECK: TagName: CPU_arch
@ CHECK: Description: ARM v4
@ CHECK: }
diff --git a/test/MC/ARM/directive-object_arch.s b/test/MC/ARM/directive-object_arch.s
index 0707077630e03..c211a3bb57e15 100644
--- a/test/MC/ARM/directive-object_arch.s
+++ b/test/MC/ARM/directive-object_arch.s
@@ -14,7 +14,7 @@
@ CHECK: }
@ CHECK: Attribute {
@ CHECK: Tag: 6
-@ CHEKC: Value: 1
+@ CHECK: Value: 1
@ CHECK: TagName: CPU_arch
@ CHECK: Description: ARM v4
@ CHECK: }
diff --git a/test/ObjectYAML/MachO/DWARF-debug_info.yaml b/test/ObjectYAML/MachO/DWARF-debug_info.yaml
index 9a616e9afb9d5..b1b6b8ad19e8c 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_info.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_info.yaml
@@ -451,6 +451,58 @@ DWARF:
- Value: 0x0000000000000001
- AbbrCode: 0x00000000
Values:
+ debug_line:
+ - TotalLength: 65
+ Version: 2
+ PrologueLength: 36
+ MinInstLength: 1
+ DefaultIsStmt: 1
+ LineBase: 251
+ LineRange: 14
+ OpcodeBase: 13
+ StandardOpcodeLengths:
+ - 0
+ - 1
+ - 1
+ - 1
+ - 1
+ - 0
+ - 0
+ - 0
+ - 1
+ - 0
+ - 0
+ - 1
+ IncludeDirs:
+ Files:
+ - Name: hello_world.c
+ DirIdx: 0
+ ModTime: 0
+ Length: 0
+ Opcodes:
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 9
+ SubOpcode: DW_LNE_set_address
+ Data: 4294971216
+ - Opcode: 0x14
+ Data: 4294971216
+ - Opcode: DW_LNS_set_column
+ Data: 3
+ - Opcode: DW_LNS_set_prologue_end
+ Data: 3
+ - Opcode: DW_LNS_const_add_pc
+ Data: 3
+ - Opcode: 0xBB
+ Data: 3
+ - Opcode: 0xBB
+ Data: 3
+ - Opcode: DW_LNS_advance_pc
+ Data: 11
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 1
+ SubOpcode: DW_LNE_end_sequence
+ Data: 11
+...
...
diff --git a/test/ObjectYAML/MachO/DWARF-debug_line.yaml b/test/ObjectYAML/MachO/DWARF-debug_line.yaml
new file mode 100644
index 0000000000000..c1e015839f975
--- /dev/null
+++ b/test/ObjectYAML/MachO/DWARF-debug_line.yaml
@@ -0,0 +1,595 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x01000007
+ cpusubtype: 0x00000003
+ filetype: 0x0000000A
+ ncmds: 7
+ sizeofcmds: 1848
+ flags: 0x00000000
+ reserved: 0x00000000
+LoadCommands:
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: B4D48511-37F4-3ED4-AFA7-1683DCE69AC4
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 4096
+ nsyms: 2
+ stroff: 4128
+ strsize: 28
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __PAGEZERO
+ vmaddr: 0
+ vmsize: 4294967296
+ fileoff: 0
+ filesize: 0
+ maxprot: 0
+ initprot: 0
+ nsects: 0
+ flags: 0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 472
+ segname: __TEXT
+ vmaddr: 4294967296
+ vmsize: 4096
+ fileoff: 0
+ filesize: 0
+ maxprot: 7
+ initprot: 5
+ nsects: 5
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x0000000100000F50
+ size: 52
+ offset: 0x00000000
+ align: 4
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x0000000100000F84
+ size: 6
+ offset: 0x00000000
+ align: 1
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x00000000
+ reserved2: 0x00000006
+ reserved3: 0x00000000
+ - sectname: __stub_helper
+ segname: __TEXT
+ addr: 0x0000000100000F8C
+ size: 26
+ offset: 0x00000000
+ align: 2
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x0000000100000FA6
+ size: 14
+ offset: 0x00000000
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000002
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x0000000100000FB4
+ size: 72
+ offset: 0x00000000
+ align: 2
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 232
+ segname: __DATA
+ vmaddr: 4294971392
+ vmsize: 4096
+ fileoff: 0
+ filesize: 0
+ maxprot: 7
+ initprot: 3
+ nsects: 2
+ flags: 0
+ Sections:
+ - sectname: __nl_symbol_ptr
+ segname: __DATA
+ addr: 0x0000000100001000
+ size: 16
+ offset: 0x00000000
+ align: 3
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000006
+ reserved1: 0x00000001
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __la_symbol_ptr
+ segname: __DATA
+ addr: 0x0000000100001010
+ size: 8
+ offset: 0x00000000
+ align: 3
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000007
+ reserved1: 0x00000003
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 4294975488
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 60
+ maxprot: 7
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 952
+ segname: __DWARF
+ vmaddr: 4294979584
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 764
+ maxprot: 7
+ initprot: 3
+ nsects: 11
+ flags: 0
+ Sections:
+ - sectname: __debug_line
+ segname: __DWARF
+ addr: 0x0000000100003000
+ size: 69
+ offset: 0x00002000
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __debug_pubnames
+ segname: __DWARF
+ addr: 0x0000000100003045
+ size: 27
+ offset: 0x00002045
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __debug_pubtypes
+ segname: __DWARF
+ addr: 0x0000000100003060
+ size: 35
+ offset: 0x00002060
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __debug_aranges
+ segname: __DWARF
+ addr: 0x0000000100003083
+ size: 48
+ offset: 0x00002083
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __debug_info
+ segname: __DWARF
+ addr: 0x00000001000030B3
+ size: 121
+ offset: 0x000020B3
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __debug_abbrev
+ segname: __DWARF
+ addr: 0x000000010000312C
+ size: 76
+ offset: 0x0000212C
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __debug_str
+ segname: __DWARF
+ addr: 0x0000000100003178
+ size: 142
+ offset: 0x00002178
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __apple_names
+ segname: __DWARF
+ addr: 0x0000000100003206
+ size: 60
+ offset: 0x00002206
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __apple_namespac
+ segname: __DWARF
+ addr: 0x0000000100003242
+ size: 36
+ offset: 0x00002242
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __apple_types
+ segname: __DWARF
+ addr: 0x0000000100003266
+ size: 114
+ offset: 0x00002266
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - sectname: __apple_objc
+ segname: __DWARF
+ addr: 0x00000001000032D8
+ size: 36
+ offset: 0x000022D8
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+LinkEditData:
+ NameList:
+ - n_strx: 2
+ n_type: 0x0F
+ n_sect: 1
+ n_desc: 16
+ n_value: 4294967296
+ - n_strx: 22
+ n_type: 0x0F
+ n_sect: 1
+ n_desc: 0
+ n_value: 4294971216
+ StringTable:
+ - ''
+ - ''
+ - __mh_execute_header
+ - _main
+DWARF:
+ debug_str:
+ - ''
+ - 'clang version 4.0.0 (trunk 288923) (llvm/trunk 288991)'
+ - hello_world.c
+ - /Users/cbieneman/dev/open-source/llvm-build-rel
+ - main
+ - argc
+ - argv
+ - int
+ - char
+ debug_abbrev:
+ - Code: 0x00000001
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_producer
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_language
+ Form: DW_FORM_data2
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_stmt_list
+ Form: DW_FORM_sec_offset
+ - Attribute: DW_AT_comp_dir
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_low_pc
+ Form: DW_FORM_addr
+ - Attribute: DW_AT_high_pc
+ Form: DW_FORM_data4
+ - Code: 0x00000002
+ Tag: DW_TAG_subprogram
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_low_pc
+ Form: DW_FORM_addr
+ - Attribute: DW_AT_high_pc
+ Form: DW_FORM_data4
+ - Attribute: DW_AT_frame_base
+ Form: DW_FORM_exprloc
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_decl_file
+ Form: DW_FORM_data1
+ - Attribute: DW_AT_decl_line
+ Form: DW_FORM_data1
+ - Attribute: DW_AT_prototyped
+ Form: DW_FORM_flag_present
+ - Attribute: DW_AT_type
+ Form: DW_FORM_ref4
+ - Attribute: DW_AT_external
+ Form: DW_FORM_flag_present
+ - Code: 0x00000003
+ Tag: DW_TAG_formal_parameter
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_location
+ Form: DW_FORM_exprloc
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_decl_file
+ Form: DW_FORM_data1
+ - Attribute: DW_AT_decl_line
+ Form: DW_FORM_data1
+ - Attribute: DW_AT_type
+ Form: DW_FORM_ref4
+ - Code: 0x00000004
+ Tag: DW_TAG_base_type
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_encoding
+ Form: DW_FORM_data1
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x00000005
+ Tag: DW_TAG_pointer_type
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_type
+ Form: DW_FORM_ref4
+ debug_aranges:
+ - Length: 44
+ Version: 2
+ CuOffset: 0
+ AddrSize: 8
+ SegSize: 0
+ Descriptors:
+ - Address: 0x0000000100000F50
+ Length: 52
+ debug_pubnames:
+ Length: 23
+ Version: 2
+ UnitOffset: 0
+ UnitSize: 121
+ Entries:
+ - DieOffset: 0x0000002A
+ Name: main
+ debug_pubtypes:
+ Length: 31
+ Version: 2
+ UnitOffset: 0
+ UnitSize: 121
+ Entries:
+ - DieOffset: 0x00000060
+ Name: int
+ - DieOffset: 0x00000071
+ Name: char
+ debug_info:
+ - Length: 117
+ Version: 4
+ AbbrOffset: 0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x00000001
+ Values:
+ - Value: 0x0000000000000001
+ - Value: 0x000000000000000C
+ - Value: 0x0000000000000038
+ - Value: 0x0000000000000000
+ - Value: 0x0000000000000046
+ - Value: 0x0000000100000F50
+ - Value: 0x0000000000000034
+ - AbbrCode: 0x00000002
+ Values:
+ - Value: 0x0000000100000F50
+ - Value: 0x0000000000000034
+ - Value: 0x0000000000000001
+ BlockData:
+ - 0x56
+ - Value: 0x0000000000000076
+ - Value: 0x0000000000000001
+ - Value: 0x0000000000000003
+ - Value: 0x0000000000000001
+ - Value: 0x0000000000000060
+ - Value: 0x0000000000000001
+ - AbbrCode: 0x00000003
+ Values:
+ - Value: 0x0000000000000002
+ BlockData:
+ - 0x91
+ - 0x78
+ - Value: 0x000000000000007B
+ - Value: 0x0000000000000001
+ - Value: 0x0000000000000003
+ - Value: 0x0000000000000060
+ - AbbrCode: 0x00000003
+ Values:
+ - Value: 0x0000000000000002
+ BlockData:
+ - 0x91
+ - 0x70
+ - Value: 0x0000000000000080
+ - Value: 0x0000000000000001
+ - Value: 0x0000000000000003
+ - Value: 0x0000000000000067
+ - AbbrCode: 0x00000000
+ Values:
+ - AbbrCode: 0x00000004
+ Values:
+ - Value: 0x0000000000000085
+ - Value: 0x0000000000000005
+ - Value: 0x0000000000000004
+ - AbbrCode: 0x00000005
+ Values:
+ - Value: 0x000000000000006C
+ - AbbrCode: 0x00000005
+ Values:
+ - Value: 0x0000000000000071
+ - AbbrCode: 0x00000004
+ Values:
+ - Value: 0x0000000000000089
+ - Value: 0x0000000000000006
+ - Value: 0x0000000000000001
+ - AbbrCode: 0x00000000
+ Values:
+ debug_line:
+ - TotalLength: 65
+ Version: 2
+ PrologueLength: 36
+ MinInstLength: 1
+ DefaultIsStmt: 1
+ LineBase: 251
+ LineRange: 14
+ OpcodeBase: 13
+ StandardOpcodeLengths:
+ - 0
+ - 1
+ - 1
+ - 1
+ - 1
+ - 0
+ - 0
+ - 0
+ - 1
+ - 0
+ - 0
+ - 1
+ IncludeDirs:
+ Files:
+ - Name: hello_world.c
+ DirIdx: 0
+ ModTime: 0
+ Length: 0
+ Opcodes:
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 9
+ SubOpcode: DW_LNE_set_address
+ Data: 4294971216
+ - Opcode: 0x14
+ Data: 4294971216
+ - Opcode: DW_LNS_set_column
+ Data: 3
+ - Opcode: DW_LNS_set_prologue_end
+ Data: 3
+ - Opcode: DW_LNS_const_add_pc
+ Data: 3
+ - Opcode: 0xBB
+ Data: 3
+ - Opcode: 0xBB
+ Data: 3
+ - Opcode: DW_LNS_advance_pc
+ Data: 11
+ - Opcode: DW_LNS_extended_op
+ ExtLen: 1
+ SubOpcode: DW_LNE_end_sequence
+ Data: 11
+...
+
+#CHECK: debug_line:
+#CHECK: - TotalLength: 65
+#CHECK: Version: 2
+#CHECK: PrologueLength: 36
+#CHECK: MinInstLength: 1
+#CHECK: DefaultIsStmt: 1
+#CHECK: LineBase: 251
+#CHECK: LineRange: 14
+#CHECK: OpcodeBase: 13
+#CHECK: StandardOpcodeLengths:
+#CHECK: - 0
+#CHECK: - 1
+#CHECK: - 1
+#CHECK: - 1
+#CHECK: - 1
+#CHECK: - 0
+#CHECK: - 0
+#CHECK: - 0
+#CHECK: - 1
+#CHECK: - 0
+#CHECK: - 0
+#CHECK: - 1
+#CHECK: IncludeDirs:
+#CHECK: Files:
+#CHECK: - Name: hello_world.c
+#CHECK: DirIdx: 0
+#CHECK: ModTime: 0
+#CHECK: Length: 0
+#CHECK: Opcodes:
+#CHECK: - Opcode: DW_LNS_extended_op
+#CHECK: ExtLen: 9
+#CHECK: SubOpcode: DW_LNE_set_address
+#CHECK: Data: 4294971216
+#CHECK: - Opcode: 0x14
+#CHECK: Data: 4294971216
+#CHECK: - Opcode: DW_LNS_set_column
+#CHECK: Data: 3
+#CHECK: - Opcode: DW_LNS_set_prologue_end
+#CHECK: Data: 3
+#CHECK: - Opcode: DW_LNS_const_add_pc
+#CHECK: Data: 3
+#CHECK: - Opcode: 0xBB
+#CHECK: Data: 3
+#CHECK: - Opcode: 0xBB
+#CHECK: Data: 3
+#CHECK: - Opcode: DW_LNS_advance_pc
+#CHECK: Data: 11
+#CHECK: - Opcode: DW_LNS_extended_op
+#CHECK: ExtLen: 1
+#CHECK: SubOpcode: DW_LNE_end_sequence
+#CHECK: Data: 11
+#CHECK: ...
diff --git a/test/Other/loop-pass-ordering.ll b/test/Other/loop-pass-ordering.ll
index ceda0d3869dda..ab3839f5cc997 100644
--- a/test/Other/loop-pass-ordering.ll
+++ b/test/Other/loop-pass-ordering.ll
@@ -8,11 +8,12 @@
; / \ \
; loop.0.0 loop.0.1 loop.1.0
;
-; CHECK: Running pass: NoOpLoopPass on loop.1.0
-; CHECK: Running pass: NoOpLoopPass on loop.1
-; CHECK: Running pass: NoOpLoopPass on loop.0.0
-; CHECK: Running pass: NoOpLoopPass on loop.0.1
-; CHECK: Running pass: NoOpLoopPass on loop.0
+; CHECK: Running pass: NoOpLoopPass on Loop at depth 2 containing: %loop.0.0
+; CHECK: Running pass: NoOpLoopPass on Loop at depth 2 containing: %loop.0.1
+; CHECK: Running pass: NoOpLoopPass on Loop at depth 1 containing: %loop.0
+; CHECK: Running pass: NoOpLoopPass on Loop at depth 2 containing: %loop.1.0
+; CHECK: Running pass: NoOpLoopPass on Loop at depth 1 containing: %loop.1
+
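+; The loop pass manager walks each nest bottom-up: both depth-2 loops run
+; before their depth-1 parent, as the CHECK lines above require.
+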
define void @f() {
entry:
br label %loop.0
diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll
index 6224af09a3f12..eae2d855e92f0 100644
--- a/test/Other/new-pass-manager.ll
+++ b/test/Other/new-pass-manager.ll
@@ -433,12 +433,12 @@
; CHECK-O: Running pass: TailCallElimPass
; CHECK-O: Running pass: SimplifyCFGPass
; CHECK-O: Running pass: ReassociatePass
-; CHECK-O: Starting llvm::Loop pass manager run.
-; CHECK-O: Finished llvm::Loop pass manager run.
+; CHECK-O: Starting Loop pass manager run.
+; CHECK-O: Finished Loop pass manager run.
; CHECK-O: Running pass: SimplifyCFGPass
; CHECK-O: Running pass: InstCombinePass
-; CHECK-O: Starting llvm::Loop pass manager run.
-; CHECK-O: Finished llvm::Loop pass manager run.
+; CHECK-O: Starting Loop pass manager run.
+; CHECK-O: Finished Loop pass manager run.
; CHECK-O: Running pass: MemCpyOptPass
; CHECK-O: Running pass: SCCPPass
; CHECK-O: Running pass: BDCEPass
@@ -544,20 +544,21 @@
; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: DominatorTreeAnalysis
; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AAManager
; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: TargetLibraryAnalysis
-; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: ScalarEvolutionAnalysis
; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AssumptionAnalysis
-; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run
; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: RepeatedPass
-; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run
; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: NoOpLoopPass
-; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run
-; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run
; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: NoOpLoopPass
-; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run
-; CHECK-REPEAT-LOOP-PASS-NEXT: Starting llvm::Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run
; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: NoOpLoopPass
-; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run
-; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run
+; CHECK-REPEAT-LOOP-PASS-NEXT: Finished Loop pass manager run
; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Function pass manager run
; CHECK-REPEAT-LOOP-PASS-NEXT: Finished llvm::Module pass manager run
diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll
index ad222dbef7aba..b303318c79636 100644
--- a/test/Other/pass-pipeline-parsing.ll
+++ b/test/Other/pass-pipeline-parsing.ll
@@ -144,10 +144,10 @@
; CHECK-TWO-NOOP-LOOP: Running pass: ModuleToFunctionPassAdaptor
; CHECK-TWO-NOOP-LOOP: Starting llvm::Function pass manager run
; CHECK-TWO-NOOP-LOOP: Running pass: FunctionToLoopPassAdaptor
-; CHECK-TWO-NOOP-LOOP: Starting llvm::Loop pass manager run
+; CHECK-TWO-NOOP-LOOP: Starting Loop pass manager run
; CHECK-TWO-NOOP-LOOP: Running pass: NoOpLoopPass
; CHECK-TWO-NOOP-LOOP: Running pass: NoOpLoopPass
-; CHECK-TWO-NOOP-LOOP: Finished llvm::Loop pass manager run
+; CHECK-TWO-NOOP-LOOP: Finished Loop pass manager run
; CHECK-TWO-NOOP-LOOP: Finished llvm::Function pass manager run
; CHECK-TWO-NOOP-LOOP: Finished llvm::Module pass manager run
@@ -167,9 +167,9 @@
; CHECK-NESTED-FP-LP: Running pass: ModuleToFunctionPassAdaptor
; CHECK-NESTED-FP-LP: Starting llvm::Function pass manager run
; CHECK-NESTED-FP-LP: Running pass: FunctionToLoopPassAdaptor
-; CHECK-NESTED-FP-LP: Starting llvm::Loop pass manager run
+; CHECK-NESTED-FP-LP: Starting Loop pass manager run
; CHECK-NESTED-FP-LP: Running pass: NoOpLoopPass
-; CHECK-NESTED-FP-LP: Finished llvm::Loop pass manager run
+; CHECK-NESTED-FP-LP: Finished Loop pass manager run
; CHECK-NESTED-FP-LP: Finished llvm::Function pass manager run
; CHECK-NESTED-FP-LP: Finished llvm::Module pass manager run
diff --git a/test/Transforms/GVN/assume-equal.ll b/test/Transforms/GVN/assume-equal.ll
index d423c1685e1de..941f14ce402cd 100644
--- a/test/Transforms/GVN/assume-equal.ll
+++ b/test/Transforms/GVN/assume-equal.ll
@@ -65,22 +65,20 @@ if.then: ; preds = %entry
%vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)**
%call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8
-; FIXME: those loads could be also direct, but right now the invariant.group
-; analysis works only on single block
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
%callx = tail call i32 %call1(%struct.A* %0) #1
%vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)**
%call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
%cally = tail call i32 %call4(%struct.A* %0) #1
%b = bitcast i8* %call to %struct.A**
%vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0
%vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)**
%vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
%unknown = tail call i32 %vfun(%struct.A* %0) #1
br label %if.end
diff --git a/test/Transforms/GVN/invariant.group.ll b/test/Transforms/GVN/invariant.group.ll
index d0b32d7f3dd8c..6f1f357cad656 100644
--- a/test/Transforms/GVN/invariant.group.ll
+++ b/test/Transforms/GVN/invariant.group.ll
@@ -392,6 +392,44 @@ define void @testNotGlobal() {
ret void
}
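+
+; The vtable loads below all carry !invariant.group on the same object, so
+; GVN can forward the function pointer stored in the entry block across the
+; loop and devirtualize the indirect call into a direct call to @_ZN1A3fooEv.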
+; CHECK-LABEL: define void @handling_loops()
+define void @handling_loops() {
+ %a = alloca %struct.A, align 8
+ %1 = bitcast %struct.A* %a to i8*
+ %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
+ store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0
+ %3 = load i8, i8* @unknownPtr, align 4
+ %4 = icmp sgt i8 %3, 0
+ br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit
+
+.lr.ph.i: ; preds = %0
+ %5 = bitcast %struct.A* %a to void (%struct.A*)***
+ %6 = load i8, i8* @unknownPtr, align 4
+ %7 = icmp sgt i8 %6, 1
+ br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit
+
+._crit_edge.preheader: ; preds = %.lr.ph.i
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge
+ %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ]
+ %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0
+ %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8
+ ; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
+ call void %9(%struct.A* nonnull %a) #3
+ ; CHECK-NOT: call void %
+ %10 = add nuw nsw i8 %8, 1
+ %11 = load i8, i8* @unknownPtr, align 4
+ %12 = icmp slt i8 %10, %11
+ br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit
+
+_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge
+ br label %_Z2g2R1A.exit
+
+_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0
+ ret void
+}
+
declare void @foo(i8*)
declare void @foo2(i8*, i8)
diff --git a/test/Transforms/InstCombine/fabs.ll b/test/Transforms/InstCombine/fabs.ll
index 6b5f5a9495301..aee853ae9eeba 100644
--- a/test/Transforms/InstCombine/fabs.ll
+++ b/test/Transforms/InstCombine/fabs.ll
@@ -5,6 +5,8 @@
declare float @fabsf(float)
declare double @fabs(double)
declare fp128 @fabsl(fp128)
+declare float @llvm.fma.f32(float, float, float)
+declare float @llvm.fmuladd.f32(float, float, float)
define float @square_fabs_call_f32(float %x) {
%mul = fmul float %x, %x
@@ -80,7 +82,6 @@ define fp128 @square_fabs_intrinsic_f128(fp128 %x) {
; CHECK-NEXT: ret fp128 %fabsl
}
-; TODO: This should be able to elimnated the fabs
define float @square_nnan_fabs_intrinsic_f32(float %x) {
%mul = fmul nnan float %x, %x
%fabsf = call float @llvm.fabs.f32(float %mul)
@@ -88,8 +89,7 @@ define float @square_nnan_fabs_intrinsic_f32(float %x) {
; CHECK-LABEL: square_nnan_fabs_intrinsic_f32(
; CHECK-NEXT: %mul = fmul nnan float %x, %x
-; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %mul)
-; CHECK-NEXT: ret float %fabsf
+; CHECK-NEXT: ret float %mul
}
; Shrinking a library call to a smaller type should not be inhibited by nor inhibit the square optimization.
@@ -170,3 +170,47 @@ define float @fabs_select_var_constant_negative(i32 %c, float %x) {
%fabs = call float @llvm.fabs.f32(float %select)
ret float %fabs
}
+
+; The fabs cannot be eliminated because %x may be a NaN
+define float @square_fma_fabs_intrinsic_f32(float %x) {
+ %fma = call float @llvm.fma.f32(float %x, float %x, float 1.0)
+ %fabsf = call float @llvm.fabs.f32(float %fma)
+ ret float %fabsf
+
+; CHECK-LABEL: @square_fma_fabs_intrinsic_f32(
+; CHECK-NEXT: %fma = call float @llvm.fma.f32(float %x, float %x, float 1.000000e+00)
+; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %fma)
+; CHECK-NEXT: ret float %fabsf
+}
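+
+; For any non-NaN %x, x*x + 1.0 is at least 1.0 and the fabs is a no-op; but
+; a NaN result may have its sign bit set, which fabs would clear, so without
+; nnan the call must stay.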
+
+; With nnan, a NaN input is excluded, so here the fabs can be eliminated
+define float @square_nnan_fma_fabs_intrinsic_f32(float %x) {
+ %fma = call nnan float @llvm.fma.f32(float %x, float %x, float 1.0)
+ %fabsf = call float @llvm.fabs.f32(float %fma)
+ ret float %fabsf
+
+; CHECK-LABEL: @square_nnan_fma_fabs_intrinsic_f32(
+; CHECK-NEXT: %fma = call nnan float @llvm.fma.f32(float %x, float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float %fma
+}
+
+define float @square_fmuladd_fabs_intrinsic_f32(float %x) {
+ %fmuladd = call float @llvm.fmuladd.f32(float %x, float %x, float 1.0)
+ %fabsf = call float @llvm.fabs.f32(float %fmuladd)
+ ret float %fabsf
+
+; CHECK-LABEL: @square_fmuladd_fabs_intrinsic_f32(
+; CHECK-NEXT: %fmuladd = call float @llvm.fmuladd.f32(float %x, float %x, float 1.000000e+00)
+; CHECK-NEXT: %fabsf = call float @llvm.fabs.f32(float %fmuladd)
+; CHECK-NEXT: ret float %fabsf
+}
+
+define float @square_nnan_fmuladd_fabs_intrinsic_f32(float %x) {
+ %fmuladd = call nnan float @llvm.fmuladd.f32(float %x, float %x, float 1.0)
+ %fabsf = call float @llvm.fabs.f32(float %fmuladd)
+ ret float %fabsf
+
+; CHECK-LABEL: @square_nnan_fmuladd_fabs_intrinsic_f32(
+; CHECK-NEXT: %fmuladd = call nnan float @llvm.fmuladd.f32(float %x, float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float %fmuladd
+}
diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll
index 84f24ca0bf247..ad8a9247e4e1d 100644
--- a/test/Transforms/InstCombine/fast-math.ll
+++ b/test/Transforms/InstCombine/fast-math.ll
@@ -241,7 +241,7 @@ define float @fmul2(float %f1) {
; X/C1 * C2 => X * (C2/C1) is disabled if X/C1 has multiple uses
@fmul2_external = external global float
define float @fmul2_disable(float %f1) {
- %div = fdiv fast float 1.000000e+00, %f1
+ %div = fdiv fast float 1.000000e+00, %f1
store float %div, float* @fmul2_external
%mul = fmul fast float %div, 2.000000e+00
ret float %mul
@@ -672,8 +672,7 @@ define double @sqrt_intrinsic_arg_4th(double %x) {
; CHECK-LABEL: sqrt_intrinsic_arg_4th(
; CHECK-NEXT: %mul = fmul fast double %x, %x
-; CHECK-NEXT: %fabs = call fast double @llvm.fabs.f64(double %mul)
-; CHECK-NEXT: ret double %fabs
+; CHECK-NEXT: ret double %mul
}
define double @sqrt_intrinsic_arg_5th(double %x) {
@@ -685,9 +684,8 @@ define double @sqrt_intrinsic_arg_5th(double %x) {
; CHECK-LABEL: sqrt_intrinsic_arg_5th(
; CHECK-NEXT: %mul = fmul fast double %x, %x
-; CHECK-NEXT: %fabs = call fast double @llvm.fabs.f64(double %mul)
; CHECK-NEXT: %sqrt1 = call fast double @llvm.sqrt.f64(double %x)
-; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1
+; CHECK-NEXT: %1 = fmul fast double %mul, %sqrt1
; CHECK-NEXT: ret double %1
}
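+
+; In both sqrt cases above, fast implies nnan, so x*x can be treated as
+; non-negative and the fabs wrapped around it folds away.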
diff --git a/test/Transforms/InstCombine/fdiv.ll b/test/Transforms/InstCombine/fdiv.ll
index af6a2401a8fc0..9a10c45233510 100644
--- a/test/Transforms/InstCombine/fdiv.ll
+++ b/test/Transforms/InstCombine/fdiv.ll
@@ -49,3 +49,21 @@ define float @test6(float %x, float %y, float %z) nounwind readnone ssp {
; CHECK-NEXT: fmul fast
; CHECK-NEXT: fdiv fast
}
+
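+; (-X)/(-Y) --> X/Y: each fneg only flips the sign bit, so the two negations
+; cancel exactly and the fold needs no fast-math flags.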
+; CHECK-LABEL: @fdiv_fneg_fneg(
+; CHECK: %div = fdiv float %x, %y
+define float @fdiv_fneg_fneg(float %x, float %y) {
+ %x.fneg = fsub float -0.0, %x
+ %y.fneg = fsub float -0.0, %y
+ %div = fdiv float %x.fneg, %y.fneg
+ ret float %div
+}
+
+; CHECK-LABEL: @fdiv_fneg_fneg_fast(
+; CHECK: %div = fdiv fast float %x, %y
+define float @fdiv_fneg_fneg_fast(float %x, float %y) {
+ %x.fneg = fsub float -0.0, %x
+ %y.fneg = fsub float -0.0, %y
+ %div = fdiv fast float %x.fneg, %y.fneg
+ ret float %div
+}
diff --git a/test/Transforms/InstCombine/pow-4.ll b/test/Transforms/InstCombine/pow-4.ll
index 911ab4d94c6ac..9293f14cb1061 100644
--- a/test/Transforms/InstCombine/pow-4.ll
+++ b/test/Transforms/InstCombine/pow-4.ll
@@ -10,8 +10,8 @@ declare float @llvm.pow.f32(float, float)
define float @test_simplify_4f(float %x) {
; CHECK-LABEL: @test_simplify_4f(
; CHECK-NOT: pow
-; CHECK-NEXT: %1 = fmul float %x, %x
-; CHECK-NEXT: %2 = fmul float %1, %1
+; CHECK-NEXT: %1 = fmul fast float %x, %x
+; CHECK-NEXT: %2 = fmul fast float %1, %1
; CHECK-NEXT: ret float %2
%1 = call fast float @llvm.pow.f32(float %x, float 4.000000e+00)
ret float %1
@@ -21,8 +21,8 @@ define float @test_simplify_4f(float %x) {
define double @test_simplify_3(double %x) {
; CHECK-LABEL: @test_simplify_3(
; CHECK-NOT: pow
-; CHECK-NEXT: %1 = fmul double %x, %x
-; CHECK-NEXT: %2 = fmul double %1, %x
+; CHECK-NEXT: %1 = fmul fast double %x, %x
+; CHECK-NEXT: %2 = fmul fast double %1, %x
; CHECK-NEXT: ret double %2
%1 = call fast double @llvm.pow.f64(double %x, double 3.000000e+00)
ret double %1
@@ -32,8 +32,8 @@ define double @test_simplify_3(double %x) {
define double @test_simplify_4(double %x) {
; CHECK-LABEL: @test_simplify_4(
; CHECK-NOT: pow
-; CHECK-NEXT: %1 = fmul double %x, %x
-; CHECK-NEXT: %2 = fmul double %1, %1
+; CHECK-NEXT: %1 = fmul fast double %x, %x
+; CHECK-NEXT: %2 = fmul fast double %1, %1
; CHECK-NEXT: ret double %2
%1 = call fast double @llvm.pow.f64(double %x, double 4.000000e+00)
ret double %1
@@ -43,11 +43,11 @@ define double @test_simplify_4(double %x) {
define double @test_simplify_15(double %x) {
; CHECK-LABEL: @test_simplify_15(
; CHECK-NOT: pow
-; CHECK-NEXT: %1 = fmul double %x, %x
-; CHECK-NEXT: %2 = fmul double %1, %x
-; CHECK-NEXT: %3 = fmul double %2, %2
-; CHECK-NEXT: %4 = fmul double %3, %3
-; CHECK-NEXT: %5 = fmul double %2, %4
+; CHECK-NEXT: %1 = fmul fast double %x, %x
+; CHECK-NEXT: %2 = fmul fast double %1, %x
+; CHECK-NEXT: %3 = fmul fast double %2, %2
+; CHECK-NEXT: %4 = fmul fast double %3, %3
+; CHECK-NEXT: %5 = fmul fast double %2, %4
; CHECK-NEXT: ret double %5
%1 = call fast double @llvm.pow.f64(double %x, double 1.500000e+01)
ret double %1
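+; x^15 is built by repeated squaring: with x3 = (x*x)*x, the chain
+; ((x3)^2)^2 * x3 takes only 5 multiplies instead of 14.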
@@ -57,11 +57,11 @@ define double @test_simplify_15(double %x) {
define double @test_simplify_neg_7(double %x) {
; CHECK-LABEL: @test_simplify_neg_7(
; CHECK-NOT: pow
-; CHECK-NEXT: %1 = fmul double %x, %x
-; CHECK-NEXT: %2 = fmul double %1, %x
-; CHECK-NEXT: %3 = fmul double %1, %2
-; CHECK-NEXT: %4 = fmul double %1, %3
-; CHECK-NEXT: %5 = fdiv double 1.000000e+00, %4
+; CHECK-NEXT: %1 = fmul fast double %x, %x
+; CHECK-NEXT: %2 = fmul fast double %1, %1
+; CHECK-NEXT: %3 = fmul fast double %2, %x
+; CHECK-NEXT: %4 = fmul fast double %1, %3
+; CHECK-NEXT: %5 = fdiv fast double 1.000000e+00, %4
; CHECK-NEXT: ret double %5
%1 = call fast double @llvm.pow.f64(double %x, double -7.000000e+00)
ret double %1
@@ -71,13 +71,13 @@ define double @test_simplify_neg_7(double %x) {
define double @test_simplify_neg_19(double %x) {
; CHECK-LABEL: @test_simplify_neg_19(
; CHECK-NOT: pow
-; CHECK-NEXT: %1 = fmul double %x, %x
-; CHECK-NEXT: %2 = fmul double %1, %1
-; CHECK-NEXT: %3 = fmul double %2, %2
-; CHECK-NEXT: %4 = fmul double %3, %3
-; CHECK-NEXT: %5 = fmul double %1, %4
-; CHECK-NEXT: %6 = fmul double %5, %x
-; CHECK-NEXT: %7 = fdiv double 1.000000e+00, %6
+; CHECK-NEXT: %1 = fmul fast double %x, %x
+; CHECK-NEXT: %2 = fmul fast double %1, %1
+; CHECK-NEXT: %3 = fmul fast double %2, %2
+; CHECK-NEXT: %4 = fmul fast double %3, %3
+; CHECK-NEXT: %5 = fmul fast double %1, %4
+; CHECK-NEXT: %6 = fmul fast double %5, %x
+; CHECK-NEXT: %7 = fdiv fast double 1.000000e+00, %6
; CHECK-NEXT: ret double %7
%1 = call fast double @llvm.pow.f64(double %x, double -1.900000e+01)
ret double %1
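+; Negative powers reuse the same squaring chain (x^19 here) and finish with
+; a single reciprocal fdiv.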
@@ -97,11 +97,11 @@ define double @test_simplify_11_23(double %x) {
define double @test_simplify_32(double %x) {
; CHECK-LABEL: @test_simplify_32(
; CHECK-NOT: pow
-; CHECK-NEXT: %1 = fmul double %x, %x
-; CHECK-NEXT: %2 = fmul double %1, %1
-; CHECK-NEXT: %3 = fmul double %2, %2
-; CHECK-NEXT: %4 = fmul double %3, %3
-; CHECK-NEXT: %5 = fmul double %4, %4
+; CHECK-NEXT: %1 = fmul fast double %x, %x
+; CHECK-NEXT: %2 = fmul fast double %1, %1
+; CHECK-NEXT: %3 = fmul fast double %2, %2
+; CHECK-NEXT: %4 = fmul fast double %3, %3
+; CHECK-NEXT: %5 = fmul fast double %4, %4
; CHECK-NEXT: ret double %5
%1 = call fast double @llvm.pow.f64(double %x, double 3.200000e+01)
ret double %1
diff --git a/test/Transforms/InstCombine/pow-sqrt.ll b/test/Transforms/InstCombine/pow-sqrt.ll
index 1e6166c5f1142..52175f1b12479 100644
--- a/test/Transforms/InstCombine/pow-sqrt.ll
+++ b/test/Transforms/InstCombine/pow-sqrt.ll
@@ -9,5 +9,14 @@ define double @pow_half(double %x) {
; CHECK-NEXT: %sqrt = call fast double @sqrt(double %x)
; CHECK-NEXT: ret double %sqrt
-declare double @llvm.pow.f64(double, double)
+define double @pow_neghalf(double %x) {
+ %pow = call fast double @llvm.pow.f64(double %x, double -5.000000e-01)
+ ret double %pow
+}
+; CHECK-LABEL: define double @pow_neghalf(
+; CHECK-NEXT: %sqrt = call fast double @sqrt(double %x) #0
+; CHECK-NEXT: %sqrtrecip = fdiv fast double 1.000000e+00, %sqrt
+; CHECK-NEXT: ret double %sqrtrecip
+
+declare double @llvm.pow.f64(double, double)
diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index 21c9fdde15069..dfdb88dcc8580 100644
--- a/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -103,3 +103,95 @@ define float @PR22688(float %x) {
ret float %7
}
+declare float @llvm.fabs.f32(float)
+
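+; fabs of a select can fold to the select only when both arms are known to
+; have a clear sign bit (positive constants, or NaNs whose sign is unset);
+; a variable arm or any sign-set arm keeps the fabs, as the cases below show.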
+; CHECK-LABEL: @fabs_select_positive_constants(
+; CHECK: %select = select i1 %cmp, float 1.000000e+00, float 2.000000e+00
+; CHECK-NEXT: ret float %select
+define float @fabs_select_positive_constants(i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 1.0, float 2.0
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_constant_variable(
+; CHECK: %select = select i1 %cmp, float 1.000000e+00, float %x
+; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %select)
+define float @fabs_select_constant_variable(i32 %c, float %x) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 1.0, float %x
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_neg0_pos0(
+; CHECK: %select = select i1 %cmp, float -0.000000e+00, float 0.000000e+00
+; CHECK: %fabs = call float @llvm.fabs.f32(float %select)
+; CHECK-NEXT: ret float %fabs
+define float @fabs_select_neg0_pos0(float addrspace(1)* %out, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -0.0, float 0.0
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_neg0_neg1(
+; CHECK: %select = select i1 %cmp, float -0.000000e+00, float -1.000000e+00
+; CHECK: %fabs = call float @llvm.fabs.f32(float %select)
+define float @fabs_select_neg0_neg1(float addrspace(1)* %out, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float -0.0, float -1.0
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_nan_nan(
+; CHECK: %select = select i1 %cmp, float 0x7FF8000000000000, float 0x7FF8000100000000
+; CHECK-NEXT: ret float %select
+define float @fabs_select_nan_nan(float addrspace(1)* %out, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 0x7FF8000000000000, float 0x7FF8000100000000
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_negnan_nan(
+; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000000000000
+; CHECK: %fabs = call float @llvm.fabs.f32(float %select)
+define float @fabs_select_negnan_nan(float addrspace(1)* %out, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000000000000
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_negnan_negnan(
+; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000100000000
+; CHECK: %fabs = call float @llvm.fabs.f32(float %select)
+define float @fabs_select_negnan_negnan(float addrspace(1)* %out, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 0xFFF8000000000000, float 0x7FF8000100000000
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_negnan_negzero(
+; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float -0.000000e+00
+; CHECK: %fabs = call float @llvm.fabs.f32(float %select)
+define float @fabs_select_negnan_negzero(float addrspace(1)* %out, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 0xFFF8000000000000, float -0.0
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
+
+; CHECK-LABEL: @fabs_select_negnan_zero(
+; CHECK: %select = select i1 %cmp, float 0xFFF8000000000000, float 0.000000e+00
+; CHECK: %fabs = call float @llvm.fabs.f32(float %select)
+define float @fabs_select_negnan_zero(float addrspace(1)* %out, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, float 0xFFF8000000000000, float 0.0
+ %fabs = call float @llvm.fabs.f32(float %select)
+ ret float %fabs
+}
diff --git a/test/Transforms/LICM/argmemonly-call.ll b/test/Transforms/LICM/argmemonly-call.ll
index 18d7f8351dca9..fe7c6af6d6d91 100644
--- a/test/Transforms/LICM/argmemonly-call.ll
+++ b/test/Transforms/LICM/argmemonly-call.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -basicaa -licm %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
declare i32 @foo() readonly argmemonly nounwind
declare i32 @foo2() readonly nounwind
declare i32 @bar(i32* %loc2) readonly argmemonly nounwind
diff --git a/test/Transforms/LICM/assume.ll b/test/Transforms/LICM/assume.ll
index f6369ac659f05..c8c93ae89b919 100644
--- a/test/Transforms/LICM/assume.ll
+++ b/test/Transforms/LICM/assume.ll
@@ -1,5 +1,5 @@
; RUN: opt -licm -basicaa < %s -S | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
define void @f_0(i1 %p) nounwind ssp {
; CHECK-LABEL: @f_0(
diff --git a/test/Transforms/LICM/atomics.ll b/test/Transforms/LICM/atomics.ll
index 5dcd4bb8c05af..d23cb49c5486e 100644
--- a/test/Transforms/LICM/atomics.ll
+++ b/test/Transforms/LICM/atomics.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -S -basicaa -licm | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
; Check that we can hoist unordered loads
define i32 @test1(i32* nocapture %y) nounwind uwtable ssp {
diff --git a/test/Transforms/LICM/basictest.ll b/test/Transforms/LICM/basictest.ll
index 570e226d23722..78c87ce765179 100644
--- a/test/Transforms/LICM/basictest.ll
+++ b/test/Transforms/LICM/basictest.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -licm | llvm-dis
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s | llvm-dis
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s | llvm-dis
define void @testfunc(i32 %i) {
; <label>:0
diff --git a/test/Transforms/LICM/constexpr.ll b/test/Transforms/LICM/constexpr.ll
index 726246776dc62..8ffc735136009 100644
--- a/test/Transforms/LICM/constexpr.ll
+++ b/test/Transforms/LICM/constexpr.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -S -basicaa -licm | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
; This fixes PR22460
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/Transforms/LICM/crash.ll b/test/Transforms/LICM/crash.ll
index 75c27b8def0ce..93ea2192e03e2 100644
--- a/test/Transforms/LICM/crash.ll
+++ b/test/Transforms/LICM/crash.ll
@@ -1,5 +1,5 @@
; RUN: opt -licm -disable-output < %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -disable-output < %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/LICM/debug-value.ll b/test/Transforms/LICM/debug-value.ll
index ab77caa2bae06..831a0d8b51f94 100644
--- a/test/Transforms/LICM/debug-value.ll
+++ b/test/Transforms/LICM/debug-value.ll
@@ -1,5 +1,5 @@
; RUN: opt -licm -basicaa < %s -S | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
define void @dgefa() nounwind ssp {
entry:
diff --git a/test/Transforms/LICM/extra-copies.ll b/test/Transforms/LICM/extra-copies.ll
index 84a3bc9ec6a6f..2f8e814c15eea 100644
--- a/test/Transforms/LICM/extra-copies.ll
+++ b/test/Transforms/LICM/extra-copies.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -licm -S | FileCheck %s
-; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
; PR19835
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LICM/funclet.ll b/test/Transforms/LICM/funclet.ll
index 9bdc6dbcde887..6b5f11507ed9b 100644
--- a/test/Transforms/LICM/funclet.ll
+++ b/test/Transforms/LICM/funclet.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -licm -S | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i386-pc-windows-msvc18.0.0"
diff --git a/test/Transforms/LICM/hoist-bitcast-load.ll b/test/Transforms/LICM/hoist-bitcast-load.ll
index 5752aecde3871..6ef00738820ef 100644
--- a/test/Transforms/LICM/hoist-bitcast-load.ll
+++ b/test/Transforms/LICM/hoist-bitcast-load.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -basicaa -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LICM/hoist-deref-load.ll b/test/Transforms/LICM/hoist-deref-load.ll
index ed6ec7694d3c6..e67becdeb5e41 100644
--- a/test/Transforms/LICM/hoist-deref-load.ll
+++ b/test/Transforms/LICM/hoist-deref-load.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -basicaa -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='loop-simplify,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LICM/hoist-nounwind.ll b/test/Transforms/LICM/hoist-nounwind.ll
index 081729f808bfa..e9720235893a8 100644
--- a/test/Transforms/LICM/hoist-nounwind.ll
+++ b/test/Transforms/LICM/hoist-nounwind.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -basicaa -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='lcssa,require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LICM/hoist-round.ll b/test/Transforms/LICM/hoist-round.ll
index a87709b810d2b..9c6a3a180b502 100644
--- a/test/Transforms/LICM/hoist-round.ll
+++ b/test/Transforms/LICM/hoist-round.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n32"
diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll
index c61131b476b9c..29595b3e1cc08 100644
--- a/test/Transforms/LICM/hoisting.ll
+++ b/test/Transforms/LICM/hoisting.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -licm -S | FileCheck %s
-; RUN: opt -lcssa %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S | FileCheck %s
+; RUN: opt -lcssa %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S | FileCheck %s
@X = global i32 0 ; <i32*> [#uses=1]
diff --git a/test/Transforms/LICM/lcssa-ssa-promoter.ll b/test/Transforms/LICM/lcssa-ssa-promoter.ll
index d466b3baffc8f..0644a627f7186 100644
--- a/test/Transforms/LICM/lcssa-ssa-promoter.ll
+++ b/test/Transforms/LICM/lcssa-ssa-promoter.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -basicaa -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s| FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s| FileCheck %s
;
; Manually validate LCSSA form is preserved even after SSAUpdater is used to
; promote things in the loop bodies.
diff --git a/test/Transforms/LICM/no-preheader-test.ll b/test/Transforms/LICM/no-preheader-test.ll
index 4b6847cdad51c..5cfa462dfc4a0 100644
--- a/test/Transforms/LICM/no-preheader-test.ll
+++ b/test/Transforms/LICM/no-preheader-test.ll
@@ -1,6 +1,6 @@
; Test that LICM works when there is not a loop-preheader
; RUN: opt < %s -licm | llvm-dis
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' < %s | llvm-dis
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s | llvm-dis
define void @testfunc(i32 %i.s, i1 %ifcond) {
br i1 %ifcond, label %Then, label %Else
diff --git a/test/Transforms/LICM/opt-remarks-conditional-load.ll b/test/Transforms/LICM/opt-remarks-conditional-load.ll
new file mode 100644
index 0000000000000..96bdeaff66ef4
--- /dev/null
+++ b/test/Transforms/LICM/opt-remarks-conditional-load.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -licm -pass-remarks-missed=licm -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks-missed=licm 2>&1 | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+; With the load from %p conditionally executed, we can't optimize this:
+; hoisting the load would execute it unconditionally and could introduce a
+; fault the original program never had. The remark should tell us about it.
+
+define void @test(i32* %array, i32* noalias %p) {
+Entry:
+ br label %Loop
+
+Loop:
+ %j = phi i32 [ 0, %Entry ], [ %Next, %else]
+ %addr = getelementptr i32, i32* %array, i32 %j
+ %a = load i32, i32* %addr
+ %c = icmp eq i32 %a, 0
+ br i1 %c, label %then, label %else
+
+then:
+; CHECK: remark: /tmp/kk.c:2:20: failed to hoist load with loop-invariant address because load is conditionally executed
+ %b = load i32, i32* %p, !dbg !8
+ %a2 = add i32 %a, %b
+ store i32 %a2, i32* %addr
+ br label %else
+
+else:
+ %Next = add i32 %j, 1
+ %cond = icmp eq i32 %Next, 0
+ br i1 %cond, label %Out, label %Loop
+
+Out:
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)
diff --git a/test/Transforms/LICM/opt-remarks-intervening-store.ll b/test/Transforms/LICM/opt-remarks-intervening-store.ll
new file mode 100644
index 0000000000000..95389ceaf9a9a
--- /dev/null
+++ b/test/Transforms/LICM/opt-remarks-intervening-store.ll
@@ -0,0 +1,67 @@
+; RUN: opt < %s -licm -pass-remarks-missed=licm -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks-missed=licm 2>&1 | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+; Without the noalias on %p, we can't optimize this: the store to %addr may
+; alias %p, so the loaded value could change on every iteration. The remark
+; should tell us about it.
+
+define void @test(i32* %array, i32* %p) {
+Entry:
+ br label %Loop
+
+Loop:
+ %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+ %addr = getelementptr i32, i32* %array, i32 %j
+ %a = load i32, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:20: failed to move load with loop-invariant address because the loop may invalidate its value
+ %b = load i32, i32* %p, !dbg !8
+ %a2 = add i32 %a, %b
+ store i32 %a2, i32* %addr
+ %Next = add i32 %j, 1
+ %cond = icmp eq i32 %Next, 0
+ br i1 %cond, label %Out, label %Loop
+
+Out:
+ ret void
+}
+
+; This illustrates why we need to check loop-invariance before issuing this
+; remark: %addr varies with %j, so the "loop-invariant address" wording
+; would be wrong here even though the store may clobber the load.
+
+define i32 @invalidated_load_with_non_loop_invariant_address(i32* %array, i32* %array2) {
+Entry:
+ br label %Loop
+
+Loop:
+ %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+
+; CHECK-NOT: /tmp/kk.c:3:20: {{.*}} loop-invariant
+ %addr = getelementptr i32, i32* %array, i32 %j
+ %a = load i32, i32* %addr, !dbg !9
+
+ %addr2 = getelementptr i32, i32* %array2, i32 %j
+ store i32 %j, i32* %addr2
+
+ %Next = add i32 %j, 1
+ %cond = icmp eq i32 %Next, 0
+ br i1 %cond, label %Out, label %Loop
+
+Out:
+ %a2 = phi i32 [ %a, %Loop ]
+ ret i32 %a2
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)
+!9 = !DILocation(line: 3, column: 20, scope: !6)
diff --git a/test/Transforms/LICM/opt-remarks.ll b/test/Transforms/LICM/opt-remarks.ll
new file mode 100644
index 0000000000000..f0ef386c9f9a2
--- /dev/null
+++ b/test/Transforms/LICM/opt-remarks.ll
@@ -0,0 +1,81 @@
+; RUN: opt < %s -licm -pass-remarks=licm -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks=licm 2>&1 | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+define void @hoist(i32* %array, i32* noalias %p) {
+Entry:
+ br label %Loop
+
+Loop:
+ %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+ %addr = getelementptr i32, i32* %array, i32 %j
+ %a = load i32, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:20: hosting load
+ %b = load i32, i32* %p, !dbg !8
+ %a2 = add i32 %a, %b
+ store i32 %a2, i32* %addr
+ %Next = add i32 %j, 1
+ %cond = icmp eq i32 %Next, 0
+ br i1 %cond, label %Out, label %Loop
+
+Out:
+ ret void
+}
+
+define i32 @sink(i32* %array, i32* noalias %p, i32 %b) {
+Entry:
+ br label %Loop
+
+Loop:
+ %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+ %addr = getelementptr i32, i32* %array, i32 %j
+ %a = load i32, i32* %addr
+ %a2 = add i32 %a, %b
+ store i32 %a2, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:21: sinking add
+ %a3 = add i32 %a, 1, !dbg !9
+ %Next = add i32 %j, 1
+ %cond = icmp eq i32 %Next, 0
+ br i1 %cond, label %Out, label %Loop
+
+Out:
+ %a4 = phi i32 [ %a3, %Loop ]
+ ret i32 %a4
+}
+
+define void @promote(i32* %array, i32* noalias %p) {
+Entry:
+ br label %Loop
+
+Loop:
+ %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+ %addr = getelementptr i32, i32* %array, i32 %j
+ %a = load i32, i32* %addr
+ %b = load i32, i32* %p
+ %a2 = add i32 %a, %b
+ store i32 %a2, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:22: Moving accesses to memory location out of the loop
+ store i32 %b, i32* %p, !dbg !10
+ %Next = add i32 %j, 1
+ %cond = icmp eq i32 %Next, 0
+ br i1 %cond, label %Out, label %Loop
+
+Out:
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)
+!9 = !DILocation(line: 2, column: 21, scope: !6)
+!10 = !DILocation(line: 2, column: 22, scope: !6)
diff --git a/test/Transforms/LICM/preheader-safe.ll b/test/Transforms/LICM/preheader-safe.ll
index adc4f4237a291..8f82d1c68bb30 100644
--- a/test/Transforms/LICM/preheader-safe.ll
+++ b/test/Transforms/LICM/preheader-safe.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -licm < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
declare void @use_nothrow(i64 %a) nounwind
declare void @use(i64 %a)
diff --git a/test/Transforms/LICM/promote-order.ll b/test/Transforms/LICM/promote-order.ll
index 7d87bb221b765..b7e0b7c6c4378 100644
--- a/test/Transforms/LICM/promote-order.ll
+++ b/test/Transforms/LICM/promote-order.ll
@@ -1,5 +1,5 @@
; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s
-; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
; LICM should keep the stores in their original order when it sinks/promotes them.
; rdar://12045203
diff --git a/test/Transforms/LICM/promote-tls.ll b/test/Transforms/LICM/promote-tls.ll
index 1849afade0e4d..076d05cf094a5 100644
--- a/test/Transforms/LICM/promote-tls.ll
+++ b/test/Transforms/LICM/promote-tls.ll
@@ -1,5 +1,5 @@
; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s
-; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
; If we can prove a local is thread local, we can insert stores during
; promotion which wouldn't be legal otherwise.
diff --git a/test/Transforms/LICM/scalar-promote-memmodel.ll b/test/Transforms/LICM/scalar-promote-memmodel.ll
index ceee7292ac5c4..c09c2b361e021 100644
--- a/test/Transforms/LICM/scalar-promote-memmodel.ll
+++ b/test/Transforms/LICM/scalar-promote-memmodel.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -basicaa -licm -S | FileCheck %s
-; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
; Make sure we don't hoist a conditionally-executed store out of the loop;
; it would violate the concurrency memory model
diff --git a/test/Transforms/LICM/scalar_promote-unwind.ll b/test/Transforms/LICM/scalar_promote-unwind.ll
index 22e7e50c22e57..dd3693b4af635 100644
--- a/test/Transforms/LICM/scalar_promote-unwind.ll
+++ b/test/Transforms/LICM/scalar_promote-unwind.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -basicaa -licm -S | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar_promote.ll
index dc5151be8a826..c88701154b8f1 100644
--- a/test/Transforms/LICM/scalar_promote.ll
+++ b/test/Transforms/LICM/scalar_promote.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -basicaa -tbaa -licm -S | FileCheck %s
-; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
@X = global i32 7 ; <i32*> [#uses=4]
diff --git a/test/Transforms/LICM/speculate.ll b/test/Transforms/LICM/speculate.ll
index fed1cbaa8555a..5d0108b129df0 100644
--- a/test/Transforms/LICM/speculate.ll
+++ b/test/Transforms/LICM/speculate.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -licm < %s | FileCheck %s
-; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
; UDiv is safe to speculate if the denominator is known non-zero.
diff --git a/test/Transforms/LICM/volatile-alias.ll b/test/Transforms/LICM/volatile-alias.ll
index 7836df004c0fc..f387012015c72 100644
--- a/test/Transforms/LICM/volatile-alias.ll
+++ b/test/Transforms/LICM/volatile-alias.ll
@@ -1,5 +1,5 @@
; RUN: opt -basicaa -sroa -loop-rotate -licm -S < %s | FileCheck %s
-; RUN: opt -basicaa -sroa -loop-rotate %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(licm)' -S | FileCheck %s
+; RUN: opt -basicaa -sroa -loop-rotate %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S | FileCheck %s
; The objects *p and *q are aliased to each other, but even though *q is
; volatile, *p can be considered invariant in the loop. Check if it is moved
; out of the loop.
diff --git a/test/Transforms/LoopSimplify/preserve-scev.ll b/test/Transforms/LoopSimplify/preserve-scev.ll
index f6fa8afc56b55..b78ce97fb46ae 100644
--- a/test/Transforms/LoopSimplify/preserve-scev.ll
+++ b/test/Transforms/LoopSimplify/preserve-scev.ll
@@ -1,14 +1,38 @@
-; RUN: opt -S < %s -indvars | opt -analyze -iv-users | grep "%cmp = icmp slt i32" | grep "= {%\.ph,+,1}<%for.cond>"
-; PR8079
+; RUN: opt -S < %s -analyze -scalar-evolution -loop-simplify -scalar-evolution | FileCheck %s
; Provide legal integer types.
target datalayout = "n8:16:32:64"
-; LoopSimplify should invalidate indvars when splitting out the
-; inner loop.
-
@maxStat = external global i32
+; LoopSimplify should invalidate SCEV when splitting out the
+; inner loop.
+;
+; First SCEV print:
+; CHECK-LABEL: Classifying expressions for: @test
+; CHECK: %[[PHI:.*]] = phi i32 [ 0, %entry ], [ %{{.*}}, %if.then5 ], [ %[[PHI]], %if.end ]
+; CHECK-LABEL: Determining loop execution counts for: @test
+; CHECK: Loop %for.body18: Unpredictable backedge-taken count.
+; CHECK: Loop %for.body18: Unpredictable max backedge-taken count.
+; CHECK: Loop %for.body18: Unpredictable predicated backedge-taken count.
+; CHECK: Loop %for.cond: <multiple exits> Unpredictable backedge-taken count.
+; CHECK: Loop %for.cond: Unpredictable max backedge-taken count.
+; CHECK: Loop %for.cond: Unpredictable predicated backedge-taken count.
+;
+; Now simplify the loop, which should cause SCEV to re-compute more precise
+; info here in addition to having preheader PHIs. Second SCEV print:
+; CHECK-LABEL: Classifying expressions for: @test
+; CHECK: phi i32 [ %{{.*}}, %if.then5 ], [ 0, %entry ]
+; CHECK-LABEL: Determining loop execution counts for: @test
+; CHECK: Loop %for.body18: Unpredictable backedge-taken count.
+; CHECK: Loop %for.body18: Unpredictable max backedge-taken count.
+; CHECK: Loop %for.body18: Unpredictable predicated backedge-taken count.
+; CHECK: Loop %for.cond: <multiple exits> Unpredictable backedge-taken count.
+; CHECK: Loop %for.cond: max backedge-taken count is -2147483647
+; CHECK: Loop %for.cond: Unpredictable predicated backedge-taken count.
+; CHECK: Loop %for.cond.outer: <multiple exits> Unpredictable backedge-taken count.
+; CHECK: Loop %for.cond.outer: Unpredictable max backedge-taken count.
+; CHECK: Loop %for.cond.outer: Unpredictable predicated backedge-taken count.
define i32 @test() nounwind {
entry:
br label %for.cond
@@ -52,12 +76,27 @@ return: ; preds = %for.body18, %for.bo
declare void @foo() nounwind
-; Notify SCEV when removing an ExitingBlock.
-; CHECK-LABEL: @mergeExit(
-; CHECK: while.cond191:
-; CHECK: br i1 %or.cond, label %while.body197
-; CHECK-NOT: land.rhs:
-; CHECK: ret
+; Notify SCEV when removing an ExitingBlock. This only changes the
+; backedge-taken information.
+;
+; First SCEV print:
+; CHECK-LABEL: Determining loop execution counts for: @mergeExit
+; CHECK: Loop %while.cond191: <multiple exits> Unpredictable backedge-taken count.
+; CHECK: Loop %while.cond191: max backedge-taken count is -1
+; CHECK: Loop %while.cond191: Unpredictable predicated backedge-taken count.
+; CHECK: Loop %while.cond191.outer: <multiple exits> Unpredictable backedge-taken count.
+; CHECK: Loop %while.cond191.outer: Unpredictable max backedge-taken count.
+; CHECK: Loop %while.cond191.outer: Unpredictable predicated backedge-taken count.
+;
+; After simplifying, the max backedge count is refined.
+; Second SCEV print:
+; CHECK-LABEL: Determining loop execution counts for: @mergeExit
+; CHECK: Loop %while.cond191: <multiple exits> Unpredictable backedge-taken count.
+; CHECK: Loop %while.cond191: max backedge-taken count is 0
+; CHECK: Loop %while.cond191: Unpredictable predicated backedge-taken count.
+; CHECK: Loop %while.cond191.outer: <multiple exits> Unpredictable backedge-taken count.
+; CHECK: Loop %while.cond191.outer: Unpredictable max backedge-taken count.
+; CHECK: Loop %while.cond191.outer: Unpredictable predicated backedge-taken count.
define void @mergeExit(i32 %MapAttrCount) nounwind uwtable ssp {
entry:
br i1 undef, label %if.then124, label %if.end126
diff --git a/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/test/Transforms/LoopUnroll/peel-loop-pgo.ll
index 18309b0691fab..a87d5643e7e9d 100644
--- a/test/Transforms/LoopUnroll/peel-loop-pgo.ll
+++ b/test/Transforms/LoopUnroll/peel-loop-pgo.ll
@@ -3,7 +3,12 @@
; Make sure we use the profile information correctly to peel-off 3 iterations
; from the loop, and update the branch weights for the peeled loop properly.
+
+; CHECK: Loop Unroll: F[basic]
; CHECK: PEELING loop %for.body with iteration count 3!
+; CHECK: Loop Unroll: F[optsize]
+; CHECK-NOT: PEELING
+
; CHECK-LABEL: @basic
; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !1
; CHECK: [[NEXT0]]:
@@ -37,6 +42,40 @@ for.end: ; preds = %for.cond.for.end_cr
ret void
}
+; We don't want to peel loops when optimizing for size.
+; CHECK-LABEL: @optsize
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br label %for.body
+; CHECK: for.body:
+; CHECK-NOT: br
+; CHECK: br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+define void @optsize(i32* %p, i32 %k) #1 !prof !0 {
+entry:
+ %cmp3 = icmp slt i32 0, %k
+ br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %p.addr.04 = phi i32* [ %p, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %p.addr.04, i32 1
+ store i32 %i.05, i32* %p.addr.04, align 4
+ %inc = add nsw i32 %i.05, 1
+ %cmp = icmp slt i32 %inc, %k
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !1
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind optsize }
+
!0 = !{!"function_entry_count", i64 1}
!1 = !{!"branch_weights", i32 3001, i32 1001}
diff --git a/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
new file mode 100644
index 0000000000000..ad79e38cafa08
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -0,0 +1,145 @@
+; REQUIRES: asserts
+; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
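+; The SLM costs below track how much of the multiply can be done at 16 bits:
+; for i8 sources, same-kind extensions (with constants folded into the
+; matching range) need only pmullw plus an extend (cost 3), while mixed
+; sext/zext pairs need the pmulhw/pmullw/shuffle sequence (cost 5); for i16
+; sources, same-kind extensions cost 5 and mixed pairs fall back to a full
+; 32-bit pmulld (cost 11).
+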
+define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
+entry:
+ %cmp12 = icmp eq i32 %N, 0
+ br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %phitmp = trunc i32 %add4 to i8
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+ ret i8 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = sext i8 %1 to i32
+; the sources of the mul are sext\sext from i8
+; use pmullw\sext seq.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul = mul nsw i32 %conv3, %conv
+; the sources of the mul are zext\sext from i8
+; use pmulhw\pmullw\pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %conv4 = zext i8 %1 to i32
+ %mul2 = mul nsw i32 %conv4, %conv
+ %sum0 = add i32 %mul, %mul2
+; the sources of the mul are zext\zext from i8
+; use pmullw\zext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %conv5 = zext i8 %0 to i32
+ %mul3 = mul nsw i32 %conv5, %conv4
+ %sum1 = add i32 %sum0, %mul3
+; the sources of the mul are sext\-120
+; use pmullw\sext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul4 = mul nsw i32 -120, %conv3
+ %sum2 = add i32 %sum1, %mul4
+; the sources of the mul are sext\250
+; use pmulhw\pmullw\pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul5 = mul nsw i32 250, %conv3
+ %sum3 = add i32 %sum2, %mul5
+; the sources of the mul are zext\-120
+; use pmulhw\pmullw\pshuf
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul6 = mul nsw i32 -120, %conv4
+ %sum4 = add i32 %sum3, %mul6
+; sources of the mul is zext\250
+; use pmullw\zext
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul7 = mul nsw i32 250, %conv4
+ %sum5 = add i32 %sum4, %mul7
+ %add = add i32 %acc.013, 5
+ %add4 = add i32 %add, %sum5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
+entry:
+ %cmp12 = icmp eq i32 %N, 0
+ br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %phitmp = trunc i32 %add4 to i16
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+ ret i16 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx, align 1
+ %conv = sext i16 %0 to i32
+ %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
+ %1 = load i16, i16* %arrayidx2, align 1
+ %conv3 = sext i16 %1 to i32
+; The sources of the mul are sext/sext from i16,
+; so the pmulhw/pmullw/pshuf sequence is used.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul = mul nsw i32 %conv3, %conv
+; The sources of the mul are zext/sext from i16,
+; so pmulld is used.
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %conv4 = zext i16 %1 to i32
+ %mul2 = mul nsw i32 %conv4, %conv
+ %sum0 = add i32 %mul, %mul2
+; The sources of the mul are zext/zext from i16,
+; so the pmulhw/pmullw/zext sequence is used.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %conv5 = zext i16 %0 to i32
+ %mul3 = mul nsw i32 %conv5, %conv4
+ %sum1 = add i32 %sum0, %mul3
+; The sources of the mul are sext/constant -32000,
+; so the pmulhw/pmullw/sext sequence is used.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul4 = mul nsw i32 -32000, %conv3
+ %sum2 = add i32 %sum1, %mul4
+; The sources of the mul are sext/constant 64000,
+; so pmulld is used.
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %mul5 = mul nsw i32 64000, %conv3
+ %sum3 = add i32 %sum2, %mul5
+; The sources of the mul are zext/constant -32000,
+; so pmulld is used.
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %mul6 = mul nsw i32 -32000, %conv4
+ %sum4 = add i32 %sum3, %mul6
+; The sources of the mul are zext/constant 64000,
+; so the pmulhw/pmullw/zext sequence is used.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul7 = mul nsw i32 64000, %conv4
+ %sum5 = add i32 %sum4, %mul7
+ %add = add i32 %acc.013, 5
+ %add4 = add i32 %add, %sum5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
diff --git a/test/Transforms/LoopVectorize/pr31190.ll b/test/Transforms/LoopVectorize/pr31190.ll
new file mode 100644
index 0000000000000..afb1754983cd3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr31190.ll
@@ -0,0 +1,64 @@
+; RUN: opt -passes='loop-vectorize' -debug -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; This checks that we don't crash when the inner loop we're trying to
+; vectorize is a SCEV AddRec with respect to an outer loop.
+
+; In this case, the problematic PHI is:
+; %0 = phi i32 [ undef, %for.cond1.preheader ], [ %inc54, %for.body3 ]
+; Since %inc54 is the IV of the outer loop, and %0 is equivalent to it,
+; we get the situation described above.
+
+; This test uses the new PM, because with the old PM, running loop-vectorize
+; would explicitly run loop-simplify. Even though this loop is already in
+; simplified form, loop-simplify would still clean up the phi.
+; The reason this matters is that in a real optimizer pipeline, LICM can create
+; such PHIs, and since it preserves loop simplified form, the cleanup has
+; no chance to run.
+
+; Code that leads to this situation can look something like:
+;
+; int a, b[1], c;
+; void fn1 ()
+; {
+; for (; c; c++)
+; for (a = 0; a; a++)
+; b[c] = 4;
+; }
+;
+; The PHI is an artifact of the register promotion of c.
+
+@c = external global i32, align 4
+@a = external global i32, align 4
+@b = external global [1 x i32], align 4
+
+; CHECK: LV: PHI is a recurrence with respect to an outer loop.
+; CHECK: LV: Not vectorizing: Cannot prove legality.
+; CHECK-LABEL: @test
+define void @test() {
+entry:
+ %a.promoted2 = load i32, i32* @a, align 1
+ %c.promoted = load i32, i32* @c, align 1
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.for.inc4_crit_edge, %entry
+ %inc54 = phi i32 [ %inc5, %for.cond1.for.inc4_crit_edge ], [ %c.promoted, %entry ]
+ %inc.lcssa3 = phi i32 [ %inc.lcssa, %for.cond1.for.inc4_crit_edge ], [ %a.promoted2, %entry ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %inc1 = phi i32 [ %inc.lcssa3, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %0 = phi i32 [ undef, %for.cond1.preheader ], [ %inc54, %for.body3 ]
+ %idxprom = sext i32 %0 to i64
+ %arrayidx = getelementptr inbounds [1 x i32], [1 x i32]* @b, i64 0, i64 %idxprom
+ store i32 4, i32* %arrayidx, align 4
+ %inc = add nsw i32 %inc1, 1
+ %tobool2 = icmp eq i32 %inc, 0
+ br i1 %tobool2, label %for.cond1.for.inc4_crit_edge, label %for.body3
+
+for.cond1.for.inc4_crit_edge: ; preds = %for.body3
+ %inc.lcssa = phi i32 [ %inc, %for.body3 ]
+ %.lcssa = phi i32 [ %inc54, %for.body3 ]
+ %inc5 = add nsw i32 %.lcssa, 1
+ br label %for.cond1.preheader
+}
diff --git a/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
index d2a3ef81a3a4c..b7a1d208fc6fd 100644
--- a/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
+++ b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
@@ -6,5 +6,5 @@ TypeIdMap:
typeid1:
TTRes:
Kind: Unsat
- SizeBitWidth: 0
+ SizeM1BitWidth: 0
...
diff --git a/test/Transforms/LowerTypeTests/function.ll b/test/Transforms/LowerTypeTests/function.ll
index 9abea8f854c18..759041fea6f11 100644
--- a/test/Transforms/LowerTypeTests/function.ll
+++ b/test/Transforms/LowerTypeTests/function.ll
@@ -43,7 +43,7 @@ declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone
define i1 @foo(i8* %p) {
; NATIVE: sub i64 {{.*}}, ptrtoint (void ()* @[[JT]] to i64)
; WASM32: sub i64 {{.*}}, ptrtoint (i8* getelementptr (i8, i8* null, i64 1) to i64)
- ; WASM32: icmp ult i64 {{.*}}, 2
+ ; WASM32: icmp ule i64 {{.*}}, 1
%x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1")
ret i1 %x
}
diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll
index 7ca70f2636fd2..7410bc4b4d885 100644
--- a/test/Transforms/LowerTypeTests/import-unsat.ll
+++ b/test/Transforms/LowerTypeTests/import-unsat.ll
@@ -10,7 +10,7 @@
; SUMMARY-NEXT: typeid1:
; SUMMARY-NEXT: TTRes:
; SUMMARY-NEXT: Kind: Unsat
-; SUMMARY-NEXT: SizeBitWidth: 0
+; SUMMARY-NEXT: SizeM1BitWidth: 0
target datalayout = "e-p:32:32"
diff --git a/test/Transforms/LowerTypeTests/simple.ll b/test/Transforms/LowerTypeTests/simple.ll
index 91b94184420be..cedfcb4a63a07 100644
--- a/test/Transforms/LowerTypeTests/simple.ll
+++ b/test/Transforms/LowerTypeTests/simple.ll
@@ -69,7 +69,7 @@ define i1 @foo(i32* %p) {
; CHECK: [[R3:%[^ ]*]] = lshr i32 [[R2]], 2
; CHECK: [[R4:%[^ ]*]] = shl i32 [[R2]], 30
; CHECK: [[R5:%[^ ]*]] = or i32 [[R3]], [[R4]]
- ; CHECK: [[R6:%[^ ]*]] = icmp ult i32 [[R5]], 68
+ ; CHECK: [[R6:%[^ ]*]] = icmp ule i32 [[R5]], 67
; CHECK: br i1 [[R6]]
; CHECK: [[R8:%[^ ]*]] = getelementptr i8, i8* @bits_use.{{[0-9]*}}, i32 [[R5]]
@@ -96,7 +96,7 @@ define i1 @bar(i32* %p) {
; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 8
; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 24
; CHECK: [[S5:%[^ ]*]] = or i32 [[S3]], [[S4]]
- ; CHECK: [[S6:%[^ ]*]] = icmp ult i32 [[S5]], 2
+ ; CHECK: [[S6:%[^ ]*]] = icmp ule i32 [[S5]], 1
%x = call i1 @llvm.type.test(i8* %pi8, metadata !"typeid2")
; CHECK: ret i1 [[S6]]
@@ -112,7 +112,7 @@ define i1 @baz(i32* %p) {
; CHECK: [[T3:%[^ ]*]] = lshr i32 [[T2]], 2
; CHECK: [[T4:%[^ ]*]] = shl i32 [[T2]], 30
; CHECK: [[T5:%[^ ]*]] = or i32 [[T3]], [[T4]]
- ; CHECK: [[T6:%[^ ]*]] = icmp ult i32 [[T5]], 66
+ ; CHECK: [[T6:%[^ ]*]] = icmp ule i32 [[T5]], 65
; CHECK: br i1 [[T6]]
; CHECK: [[T8:%[^ ]*]] = getelementptr i8, i8* @bits_use{{(\.[0-9]*)?}}, i32 [[T5]]
diff --git a/test/Transforms/NewGVN/assume-equal.ll b/test/Transforms/NewGVN/assume-equal.ll
index b6c2a7afb2943..7e009192064a7 100644
--- a/test/Transforms/NewGVN/assume-equal.ll
+++ b/test/Transforms/NewGVN/assume-equal.ll
@@ -66,22 +66,20 @@ if.then: ; preds = %entry
%vtable1 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable2.cast = bitcast i8** %vtable1 to i32 (%struct.A*)**
%call1 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable2.cast, align 8
-; FIXME: those loads could be also direct, but right now the invariant.group
-; analysis works only on single block
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
%callx = tail call i32 %call1(%struct.A* %0) #1
%vtable2 = load i8**, i8*** %1, align 8, !invariant.group !0
%vtable3.cast = bitcast i8** %vtable2 to i32 (%struct.A*)**
%call4 = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable3.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
%cally = tail call i32 %call4(%struct.A* %0) #1
%b = bitcast i8* %call to %struct.A**
%vtable3 = load %struct.A*, %struct.A** %b, align 8, !invariant.group !0
%vtable4.cast = bitcast %struct.A* %vtable3 to i32 (%struct.A*)**
%vfun = load i32 (%struct.A*)*, i32 (%struct.A*)** %vtable4.cast, align 8
-; CHECK-NOT: call i32 @_ZN1A3fooEv(
+; CHECK: call i32 @_ZN1A3fooEv(
%unknown = tail call i32 %vfun(%struct.A* %0) #1
br label %if.end
diff --git a/test/Transforms/NewGVN/invariant.group.ll b/test/Transforms/NewGVN/invariant.group.ll
index 80c6e05a8e24e..c421df6bd3b15 100644
--- a/test/Transforms/NewGVN/invariant.group.ll
+++ b/test/Transforms/NewGVN/invariant.group.ll
@@ -393,6 +393,45 @@ define void @testNotGlobal() {
ret void
}
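+; The vtable loads in the loop below carry !invariant.group, so NewGVN can
+; prove each iteration loads the same vfunc and devirtualize the indirect
+; call, as the CHECK lines inside the function verify.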
+; CHECK-LABEL: define void @handling_loops()
+define void @handling_loops() {
+ %a = alloca %struct.A, align 8
+ %1 = bitcast %struct.A* %a to i8*
+ %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
+ store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %2, align 8, !invariant.group !0
+ %3 = load i8, i8* @unknownPtr, align 4
+ %4 = icmp sgt i8 %3, 0
+ br i1 %4, label %.lr.ph.i, label %_Z2g2R1A.exit
+
+.lr.ph.i: ; preds = %0
+ %5 = bitcast %struct.A* %a to void (%struct.A*)***
+ %6 = load i8, i8* @unknownPtr, align 4
+ %7 = icmp sgt i8 %6, 1
+ br i1 %7, label %._crit_edge.preheader, label %_Z2g2R1A.exit
+
+._crit_edge.preheader: ; preds = %.lr.ph.i
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.preheader, %._crit_edge
+ %8 = phi i8 [ %10, %._crit_edge ], [ 1, %._crit_edge.preheader ]
+ %.pre = load void (%struct.A*)**, void (%struct.A*)*** %5, align 8, !invariant.group !0
+ %9 = load void (%struct.A*)*, void (%struct.A*)** %.pre, align 8
+; CHECK: call void @_ZN1A3fooEv(%struct.A* nonnull %a)
+ call void %9(%struct.A* nonnull %a) #3
+
+; CHECK-NOT: call void %
+ %10 = add nuw nsw i8 %8, 1
+ %11 = load i8, i8* @unknownPtr, align 4
+ %12 = icmp slt i8 %10, %11
+ br i1 %12, label %._crit_edge, label %_Z2g2R1A.exit.loopexit
+
+_Z2g2R1A.exit.loopexit: ; preds = %._crit_edge
+ br label %_Z2g2R1A.exit
+
+_Z2g2R1A.exit: ; preds = %_Z2g2R1A.exit.loopexit, %.lr.ph.i, %0
+ ret void
+}
+
declare void @foo(i8*)
declare void @foo2(i8*, i8)
diff --git a/test/Transforms/NewGVN/pr31594.ll b/test/Transforms/NewGVN/pr31594.ll
new file mode 100644
index 0000000000000..0cdac1a7fff4a
--- /dev/null
+++ b/test/Transforms/NewGVN/pr31594.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @patatino(i8* %blah, i32 %choice) {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[FOO:%.*]] = phi i8* [ [[BLAH:%.*]], [[ENTRY:%.*]] ], [ null, [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: switch i32 [[CHOICE:%.*]], label [[WHILE_BODY]] [
+; CHECK-NEXT: i32 -1, label [[WHILE_END:%.*]]
+; CHECK-NEXT: i32 40, label [[LAND_END:%.*]]
+; CHECK-NEXT: ]
+; CHECK: land.end:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.body:
+; CHECK-NEXT: br label [[WHILE_COND]]
+; CHECK: while.end:
+; CHECK-NEXT: store i8 0, i8* [[FOO]], align 1
+; CHECK-NEXT: store i8 0, i8* [[BLAH]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %foo = phi i8* [ %blah, %entry ], [ null, %while.body ]
+ switch i32 %choice, label %while.body [
+ i32 -1, label %while.end
+ i32 40, label %land.end
+ ]
+
+land.end:
+ br label %while.end
+
+while.body:
+ br label %while.cond
+
+while.end:
+ %foo.lcssa = phi i8* [ %foo, %land.end ], [ %foo, %while.cond ]
+;; These two stores will initially be considered equivalent, but then proven
+;; not equivalent. The second store would previously end up deciding it was
+;; equivalent to an earlier store, when it had really just found an optimistic
+;; version of itself in the congruence class.
+ store i8 0, i8* %foo.lcssa, align 1
+ %0 = load i8, i8* %blah, align 1
+ %loaded = icmp eq i8 %0, 0
+ store i8 0, i8* %blah, align 1
+ ret void
+}
+
+
+;; This is an example of a case where the memory states are equivalent solely due to unreachability,
+;; but the stores are not equal.
+define void @foo(i8* %arg) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[TMP:%.*]] = phi i8* [ [[ARG:%.*]], [[BB:%.*]] ], [ null, [[BB2:%.*]] ]
+; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB2]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[BB1]]
+; CHECK: bb3:
+; CHECK-NEXT: store i8 0, i8* [[TMP]], !g !0
+; CHECK-NEXT: br label [[BB4:%.*]]
+; CHECK: bb4:
+; CHECK-NEXT: br label [[BB6:%.*]]
+; CHECK: bb6:
+; CHECK-NEXT: br i1 undef, label [[BB9:%.*]], label [[BB7:%.*]]
+; CHECK: bb7:
+; CHECK-NEXT: switch i8 0, label [[BB6]] [
+; CHECK-NEXT: i8 6, label [[BB8:%.*]]
+; CHECK-NEXT: ]
+; CHECK: bb8:
+; CHECK-NEXT: br label [[BB4]]
+; CHECK: bb9:
+; CHECK-NEXT: store i8 0, i8* [[ARG]], !g !0
+; CHECK-NEXT: unreachable
+;
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb2, %bb
+ %tmp = phi i8* [ %arg, %bb ], [ null, %bb2 ]
+ br i1 undef, label %bb3, label %bb2
+
+bb2: ; preds = %bb1
+ br label %bb1
+
+bb3: ; preds = %bb1
+ store i8 0, i8* %tmp, !g !0
+ br label %bb4
+
+bb4: ; preds = %bb8, %bb3
+ %tmp5 = phi i8* [ null, %bb8 ], [ %arg, %bb3 ]
+ br label %bb6
+
+bb6: ; preds = %bb7, %bb4
+ br i1 undef, label %bb9, label %bb7
+
+bb7: ; preds = %bb6
+ switch i8 0, label %bb6 [
+ i8 6, label %bb8
+ ]
+
+bb8: ; preds = %bb7
+ store i8 undef, i8* %tmp5, !g !0
+ br label %bb4
+
+bb9: ; preds = %bb6
+ %tmp10 = phi i8* [ %tmp5, %bb6 ]
+ store i8 0, i8* %tmp10, !g !0
+ unreachable
+}
+
+!0 = !{}
diff --git a/test/Transforms/PGOProfile/Inputs/multiple_hash_profile.proftext b/test/Transforms/PGOProfile/Inputs/multiple_hash_profile.proftext
new file mode 100644
index 0000000000000..5bf67fb2bfaf2
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/multiple_hash_profile.proftext
@@ -0,0 +1,36 @@
+# IR level Instrumentation Flag
+:ir
+_Z3fooi
+# Func Hash:
+72057606922829823
+# Num Counters:
+2
+# Counter Values:
+18
+12
+
+_Z3fooi
+# Func Hash:
+12884901887
+# Num Counters:
+1
+# Counter Values:
+0
+
+_Z3bari
+# Func Hash:
+72057606922829823
+# Num Counters:
+2
+# Counter Values:
+0
+0
+
+_Z4m2f1v
+# Func Hash:
+12884901887
+# Num Counters:
+1
+# Counter Values:
+1
+
diff --git a/test/Transforms/PGOProfile/comdat_internal.ll b/test/Transforms/PGOProfile/comdat_internal.ll
index 25dafbea1035d..7df6f91fe7297 100644
--- a/test/Transforms/PGOProfile/comdat_internal.ll
+++ b/test/Transforms/PGOProfile/comdat_internal.ll
@@ -4,17 +4,17 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
$foo = comdat any
-; CHECK: $foo.[[FOO_HASH:[0-9]+]] = comdat any
+; CHECK: $foo = comdat any
; CHECK: $__llvm_profile_raw_version = comdat any
-; CHECK: $__profv__stdin__foo.[[FOO_HASH]] = comdat any
+; CHECK: $__profv__stdin__foo.[[FOO_HASH:[0-9]+]] = comdat any
@bar = global i32 ()* @foo, align 8
; CHECK: @__llvm_profile_raw_version = constant i64 {{[0-9]+}}, comdat
-; CHECK: @__profn__stdin__foo.[[FOO_HASH]] = private constant [23 x i8] c"<stdin>:foo.[[FOO_HASH]]"
+; CHECK: @__profn__stdin__foo = private constant [11 x i8] c"<stdin>:foo"
; CHECK: @__profc__stdin__foo.[[FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8
-; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 6965568665848889497, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null
+; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 -5640069336071256030, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null
; CHECK-NOT: bitcast (i32 ()* @foo to i8*)
; CHECK-SAME: , i8* null, i32 1, [1 x i16] zeroinitializer }, section "__llvm_prf_data", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8
; CHECK: @__llvm_prf_nm
diff --git a/test/Transforms/PGOProfile/comdat_rename.ll b/test/Transforms/PGOProfile/comdat_rename.ll
index b69c802093b4c..eb9ddb4a1cea7 100644
--- a/test/Transforms/PGOProfile/comdat_rename.ll
+++ b/test/Transforms/PGOProfile/comdat_rename.ll
@@ -1,7 +1,7 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -pgo-instr-gen -S | FileCheck --check-prefixes COMMON,ELFONLY %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=pgo-instr-gen -S | FileCheck --check-prefixes COMMON,ELFONLY %s
-; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -pgo-instr-gen -S | FileCheck --check-prefixes COMMON,COFFONLY %s
-; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -passes=pgo-instr-gen -S | FileCheck --check-prefixes COMMON,COFFONLY %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,ELFONLY %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,ELFONLY %s
+; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,COFFONLY %s
+; RUN: opt < %s -mtriple=x86_64-pc-win32-coff -passes=pgo-instr-gen -do-comdat-renaming=true -S | FileCheck --check-prefixes COMMON,COFFONLY %s
; Rename Comdat group and its function.
$f = comdat any
@@ -38,22 +38,10 @@ define linkonce void @tf2() comdat($tf) {
ret void
}
-; Renaming Comdat with aliases.
-$f_with_alias = comdat any
-; COMMON: $f_with_alias.[[SINGLEBB_HASH]] = comdat any
-@af = alias void (...), bitcast (void ()* @f_with_alias to void (...)*)
-; COFFONLY: @af.[[SINGLEBB_HASH]] = alias void (...), bitcast (void ()* @f_with_alias.[[SINGLEBB_HASH]] to
-; ELFONLY-DAG: @af.[[SINGLEBB_HASH]] = alias void (...), bitcast (void ()* @f_with_alias.[[SINGLEBB_HASH]] to
-define linkonce_odr void @f_with_alias() comdat($f_with_alias) {
- ret void
-}
-
; Rename AvailableExternallyLinkage functions
; ELFONLY-DAG: $aef.[[SINGLEBB_HASH]] = comdat any
; ELFONLY: @f = weak alias void (), void ()* @f.[[SINGLEBB_HASH]]
-; ELFONLY: @f_with_alias = weak alias void (), void ()* @f_with_alias.[[SINGLEBB_HASH]]
-; ELFONLY: @af = weak alias void (...), void (...)* @af.[[SINGLEBB_HASH]]
; ELFONLY: @aef = weak alias void (), void ()* @aef.[[SINGLEBB_HASH]]
define available_externally void @aef() {
diff --git a/test/Transforms/PGOProfile/indirect_call_profile.ll b/test/Transforms/PGOProfile/indirect_call_profile.ll
index 409c29ef8728a..e1f499c08a7b2 100644
--- a/test/Transforms/PGOProfile/indirect_call_profile.ll
+++ b/test/Transforms/PGOProfile/indirect_call_profile.ll
@@ -54,7 +54,7 @@ bb11: ; preds = %bb2
}
; Test that comdat function's address is recorded.
-; LOWER: @__profd_foo3.[[FOO3_HASH:[0-9]+]] = linkonce_odr{{.*}}@foo3.[[FOO3_HASH]]
+; LOWER: @__profd_foo3.[[FOO3_HASH:[0-9]+]] = linkonce_odr{{.*}}@__profc_foo3.[[FOO3_HASH]]
; Function Attrs: nounwind uwtable
define linkonce_odr i32 @foo3() comdat {
ret i32 1
diff --git a/test/Transforms/PGOProfile/multiple_hash_profile.ll b/test/Transforms/PGOProfile/multiple_hash_profile.ll
new file mode 100644
index 0000000000000..f4041830f8f8f
--- /dev/null
+++ b/test/Transforms/PGOProfile/multiple_hash_profile.ll
@@ -0,0 +1,36 @@
+; RUN: llvm-profdata merge %S/Inputs/multiple_hash_profile.proftext -o %t.profdata
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s
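+
+; The input proftext contains two records for _Z3fooi with different function
+; hashes; -pgo-instr-use must pick the record whose hash matches this IR and
+; use its counters (18 and 12) to derive the branch weights checked below.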
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$_Z3fooi = comdat any
+
+@g2 = local_unnamed_addr global i32 (i32)* null, align 8
+
+define i32 @_Z3bari(i32 %i) {
+entry:
+ %cmp = icmp sgt i32 %i, 2
+ %mul = select i1 %cmp, i32 1, i32 %i
+ %retval.0 = mul nsw i32 %mul, %i
+ ret i32 %retval.0
+}
+
+define void @_Z4m2f1v() {
+entry:
+ store i32 (i32)* @_Z3fooi, i32 (i32)** @g2, align 8
+ ret void
+}
+
+define linkonce_odr i32 @_Z3fooi(i32 %i) comdat {
+entry:
+ %cmp.i = icmp sgt i32 %i, 2
+ %mul.i = select i1 %cmp.i, i32 1, i32 %i
+; CHECK: %mul.i = select i1 %cmp.i, i32 1, i32 %i
+; CHECK-SAME: !prof ![[BW:[0-9]+]]
+; CHECK: ![[BW]] = !{!"branch_weights", i32 12, i32 6}
+ %retval.0.i = mul nsw i32 %mul.i, %i
+ ret i32 %retval.0.i
+}
+
+
diff --git a/test/Transforms/SLPVectorizer/X86/pr31599.ll b/test/Transforms/SLPVectorizer/X86/pr31599.ll
new file mode 100644
index 0000000000000..64e0f7be7e2ef
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr31599.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
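+; The two scalar fsubs over matching lanes of %source are expected to be
+; vectorized into the single <2 x float> fsub seen in the autogenerated
+; checks below.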
+define <2 x float> @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SOURCE:%.*]] = insertelement <2 x float> undef, float undef, i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT: [[RES1:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-NEXT: [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1
+; CHECK-NEXT: ret <2 x float> [[RES2]]
+;
+entry:
+ %source = insertelement <2 x float> undef, float undef, i32 0
+ %e0 = extractelement <2 x float> %source, i32 0
+ %e0.dup = extractelement <2 x float> %source, i32 0
+ %sub1 = fsub float %e0, %e0.dup
+ %e1 = extractelement <2 x float> %source, i32 1
+ %e1.dup = extractelement <2 x float> %source, i32 1
+ %sub2 = fsub float %e1, %e1.dup
+ %res1 = insertelement <2 x float> undef, float %sub1, i32 0
+ %res2 = insertelement <2 x float> %res1, float %sub2, i32 1
+ ret <2 x float> %res2
+}
+
+!llvm.ident = !{!0, !0}
+
+!0 = !{!"clang version 4.0.0 "}
diff --git a/test/Transforms/StructurizeCFG/no-branch-to-entry.ll b/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
index 2e22c8715347e..1db1060ca8212 100644
--- a/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
+++ b/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s
+; RUN: opt -S -o - -structurizecfg -verify-dom-info < %s | FileCheck %s
; CHECK-LABEL: @no_branch_to_entry_undef(
; CHECK: entry:
diff --git a/test/tools/llvm-config/booleans.test b/test/tools/llvm-config/booleans.test
new file mode 100644
index 0000000000000..b28f293666f35
--- /dev/null
+++ b/test/tools/llvm-config/booleans.test
@@ -0,0 +1,28 @@
+# Check whether boolean options are consistently normalized to ON/OFF.
+RUN: llvm-config --assertion-mode 2>&1 | FileCheck --check-prefix=CHECK-ONOFF %s
+RUN: llvm-config --has-global-isel 2>&1 | FileCheck --check-prefix=CHECK-ONOFF %s
+CHECK-ONOFF: {{ON|OFF}}
+CHECK-ONOFF-NOT: error:
+CHECK-ONOFF-NOT: warning
+
+# ...or to YES/NO.
+RUN: llvm-config --has-rtti 2>&1 | FileCheck --check-prefix=CHECK-YESNO %s
+CHECK-YESNO: {{YES|NO}}
+CHECK-YESNO-NOT: error:
+CHECK-YESNO-NOT: warning
+
+# Also check some other multi-choice options.
+RUN: llvm-config --build-mode 2>&1 | FileCheck --check-prefix=CHECK-BUILD-MODE %s
+CHECK-BUILD-MODE: {{[Dd][Ee][Bb][Uu][Gg]|[Rr][Ee][Ll][Ee][Aa][Ss][Ee]|[Rr][Ee][Ll][Ww][Ii][Tt][Hh][Dd][Ee][Bb][Ii][Nn][Ff][Oo]|[Mm][Ii][Nn][Ss][Ii][Zz][Ee][Rr][Ee][Ll]}}
+CHECK-BUILD-MODE-NOT: error:
+CHECK-BUILD-MODE-NOT: warning
+
+RUN: llvm-config --build-system 2>&1 | FileCheck --check-prefix=CHECK-BUILD-SYSTEM %s
+CHECK-BUILD-SYSTEM: cmake
+CHECK-BUILD-SYSTEM-NOT: error:
+CHECK-BUILD-SYSTEM-NOT: warning
+
+RUN: llvm-config --shared-mode 2>&1 | FileCheck --check-prefix=CHECK-SHARED-MODE %s
+CHECK-SHARED-MODE: {{static|shared}}
+CHECK-SHARED-MODE-NOT: error:
+CHECK-SHARED-MODE-NOT: warning
diff --git a/test/tools/llvm-xray/X86/Inputs/elf64-objcopied-instrmap.bin b/test/tools/llvm-xray/X86/Inputs/elf64-objcopied-instrmap.bin
new file mode 100755
index 0000000000000..4e1f982e2673d
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/elf64-objcopied-instrmap.bin
Binary files differ
diff --git a/test/tools/llvm-xray/X86/Inputs/elf64-sample-o2.bin b/test/tools/llvm-xray/X86/Inputs/elf64-sample-o2.bin
new file mode 100755
index 0000000000000..fe31f9514d37e
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/elf64-sample-o2.bin
Binary files differ
diff --git a/test/tools/llvm-xray/X86/Inputs/naive-log-simple.xray b/test/tools/llvm-xray/X86/Inputs/naive-log-simple.xray
new file mode 100644
index 0000000000000..284bc6b63e957
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/naive-log-simple.xray
Binary files differ
diff --git a/test/tools/llvm-xray/X86/Inputs/simple-instrmap.yaml b/test/tools/llvm-xray/X86/Inputs/simple-instrmap.yaml
new file mode 100644
index 0000000000000..e9c9f2e8d3c86
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/simple-instrmap.yaml
@@ -0,0 +1,10 @@
+# This is a simple instrumentation map with bogus addresses and offsets, but
+# it follows the recommended format.
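+#
+# Each entry is one sled record: a function id, the sled address, the owning
+# function's address, the sled kind, and whether the sled is always
+# instrumented.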
+---
+- { id: 1, address: 0x1, function: 0x1, kind: function-enter, always-instrument: true}
+- { id: 1, address: 0x2, function: 0x1, kind: function-exit, always-instrument: true}
+- { id: 2, address: 0x2, function: 0x2, kind: function-enter, always-instrument: true}
+- { id: 2, address: 0x3, function: 0x2, kind: function-exit, always-instrument: true}
+- { id: 3, address: 0x3, function: 0x3, kind: function-enter, always-instrument: true}
+- { id: 3, address: 0x4, function: 0x3, kind: function-exit, always-instrument: true}
+...
diff --git a/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml b/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml
new file mode 100644
index 0000000000000..483d3e4f2c8fa
--- /dev/null
+++ b/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml
@@ -0,0 +1,14 @@
+---
+- { id: 1, address: 0x000000000041CA40, function: 0x000000000041CA40, kind: function-enter,
+ always-instrument: true }
+- { id: 1, address: 0x000000000041CA50, function: 0x000000000041CA40, kind: tail-exit,
+ always-instrument: true }
+- { id: 2, address: 0x000000000041CA70, function: 0x000000000041CA70, kind: function-enter,
+ always-instrument: true }
+- { id: 2, address: 0x000000000041CA7C, function: 0x000000000041CA70, kind: tail-exit,
+ always-instrument: true }
+- { id: 3, address: 0x000000000041CAA0, function: 0x000000000041CAA0, kind: function-enter,
+ always-instrument: true }
+- { id: 3, address: 0x000000000041CAB4, function: 0x000000000041CAA0, kind: function-exit,
+ always-instrument: true }
+...
diff --git a/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml b/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml
new file mode 100644
index 0000000000000..6e926974141f3
--- /dev/null
+++ b/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml
@@ -0,0 +1,36 @@
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d | FileCheck %s
+---
+header:
+ version: 1
+ type: 0
+ constant-tsc: true
+ nonstop-tsc: true
+ cycle-frequency: 0
+records:
+# Here we reconstruct the following call trace:
+#
+# f1()
+# f2()
+# f3()
+#
+# But we find that we're missing an exit record for f2() because it
+# tail-called f3(). We make sure that, if we see a trace like this, we can
+# deduce tail calls and account the time (potentially wrongly) to f2() when
+# f1() exits. That is because we don't go back to f3()'s entry record to
+# properly do the math on the timing of f2().
+#
+# Note that by default, tail/sibling call deduction is disabled, and is enabled
+# with a flag "-d" or "-deduce-sibling-calls".
+#
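+# With the records below, f1 spans tsc 10000-10004 (latency 4), f3 spans
+# 10002-10003 (latency 1), and the deduced f2 gets 10001-10004 (latency 3),
+# matching the per-function latencies checked at the bottom.
+#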
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 }
+ - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+ - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }
+ - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 10003 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10004 }
+...
+
+#CHECK: Functions with latencies: 3
+#CHECK-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#CHECK-NEXT: 1 1 [ 4.{{.*}}, 4.{{.*}}, 4.{{.*}}, 4.{{.*}}, 4.{{.*}}] {{.*}} {{.*}}
+#CHECK-NEXT: 2 1 [ 3.{{.*}}, 3.{{.*}}, 3.{{.*}}, 3.{{.*}}, 3.{{.*}}] {{.*}} {{.*}}
+#CHECK-NEXT: 3 1 [ 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}] {{.*}} {{.*}}
diff --git a/test/tools/llvm-xray/X86/account-keep-going.yaml b/test/tools/llvm-xray/X86/account-keep-going.yaml
new file mode 100644
index 0000000000000..1b234c0d7e8e4
--- /dev/null
+++ b/test/tools/llvm-xray/X86/account-keep-going.yaml
@@ -0,0 +1,20 @@
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -k | FileCheck %s
+---
+header:
+ version: 1
+ type: 0
+ constant-tsc: true
+ nonstop-tsc: true
+ cycle-frequency: 0
+records:
+# We want to test the case where we see spurious exits but keep going anyway,
+# ignoring those records in the process.
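+# Only f3 has a well-nested enter/exit pair (tsc 10002-10003), so exactly one
+# function is accounted in the checked output.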
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10000 }
+ - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+ - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }
+ - { type: 0, func-id: 3, cpu: 1, thread: 111, kind: function-exit, tsc: 10003 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10004 }
+...
+#CHECK: Functions with latencies: 1
+#CHECK-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#CHECK-NEXT: 3 1 [ 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}, 1.{{.*}}] {{.*}} {{.*}}
diff --git a/test/tools/llvm-xray/X86/account-simple-case.yaml b/test/tools/llvm-xray/X86/account-simple-case.yaml
new file mode 100644
index 0000000000000..82d83aae033ef
--- /dev/null
+++ b/test/tools/llvm-xray/X86/account-simple-case.yaml
@@ -0,0 +1,18 @@
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml | FileCheck %s
+---
+header:
+ version: 1
+ type: 0
+ constant-tsc: true
+ nonstop-tsc: true
+ cycle-frequency: 2601000000
+records:
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
+ tsc: 10001 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
+ tsc: 10100 }
+...
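+
+# The single enter/exit pair for func-id 1 spans 10100 - 10001 = 99 tsc ticks
+# at the cycle frequency given in the header.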
+
+#CHECK: Functions with latencies: 1
+#CHECK-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#CHECK-NEXT: 1 1 [ {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
diff --git a/test/tools/llvm-xray/X86/account-simple-sorting.yaml b/test/tools/llvm-xray/X86/account-simple-sorting.yaml
new file mode 100644
index 0000000000000..d25aef24a2721
--- /dev/null
+++ b/test/tools/llvm-xray/X86/account-simple-sorting.yaml
@@ -0,0 +1,85 @@
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml | FileCheck --check-prefix DEFAULT %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s count | FileCheck --check-prefix COUNT-ASC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s min | FileCheck --check-prefix MIN-ASC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s max | FileCheck --check-prefix MAX-ASC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s sum | FileCheck --check-prefix SUM-ASC %s
+
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s count -r dsc | FileCheck --check-prefix COUNT-DSC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s min -r dsc | FileCheck --check-prefix MIN-DSC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s max -r dsc | FileCheck --check-prefix MAX-DSC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s sum -r dsc | FileCheck --check-prefix SUM-DSC %s
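+
+# -s selects the column to sort on (count, min, max, sum) and -r dsc flips the
+# order to descending. Function id 1 has three ~99-tick calls while function
+# id 2 has two 1-tick calls, so each ascending/descending prefix pair checks
+# the two rows in opposite orders.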
+---
+header:
+ version: 1
+ type: 0
+ constant-tsc: true
+ nonstop-tsc: true
+ cycle-frequency: 1
+records:
+ # Function id: 1
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
+ tsc: 10001 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
+ tsc: 10100 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
+ tsc: 10101 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
+ tsc: 10200 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
+ tsc: 10201 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
+ tsc: 10300 }
+ # Function id: 2
+ - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter,
+ tsc: 10001 }
+ - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-exit,
+ tsc: 10002 }
+ - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter,
+ tsc: 10101 }
+ - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-exit,
+ tsc: 10102 }
+
+#DEFAULT: Functions with latencies: 2
+#DEFAULT-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#DEFAULT-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#DEFAULT-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#COUNT-ASC: Functions with latencies: 2
+#COUNT-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#COUNT-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#COUNT-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#COUNT-DSC: Functions with latencies: 2
+#COUNT-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#COUNT-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#COUNT-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#MIN-ASC: Functions with latencies: 2
+#MIN-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#MIN-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#MIN-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#MIN-DSC: Functions with latencies: 2
+#MIN-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#MIN-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#MIN-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#MAX-ASC: Functions with latencies: 2
+#MAX-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#MAX-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#MAX-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#MAX-DSC: Functions with latencies: 2
+#MAX-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#MAX-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#MAX-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#SUM-ASC: Functions with latencies: 2
+#SUM-ASC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#SUM-ASC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#SUM-ASC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+
+#SUM-DSC: Functions with latencies: 2
+#SUM-DSC-NEXT: funcid count [ min, med, 90p, 99p, max] sum function
+#SUM-DSC-NEXT: 1 3 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
+#SUM-DSC-NEXT: 2 2 [{{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}] {{.*}} {{.*}}
diff --git a/test/tools/llvm-xray/X86/bad-instrmap-sizes.bin b/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt
index 4ea33510e5dcc..4ea33510e5dcc 100644
--- a/test/tools/llvm-xray/X86/bad-instrmap-sizes.bin
+++ b/test/tools/llvm-xray/X86/bad-instrmap-sizes.txt
diff --git a/test/tools/llvm-xray/X86/convert-roundtrip.yaml b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
new file mode 100644
index 0000000000000..8444262842643
--- /dev/null
+++ b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
@@ -0,0 +1,28 @@
+#RUN: llvm-xray convert %s -f=raw -o %t && llvm-xray convert %t -f=yaml -o - | FileCheck %s
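+
+# Round trip: YAML -> raw binary -> YAML. No instrumentation map is supplied,
+# so the function name in the output is just the quoted func-id ('1'), as the
+# checks below show.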
+---
+header:
+ version: 1
+ type: 0
+ constant-tsc: true
+ nonstop-tsc: true
+ cycle-frequency: 2601000000
+records:
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
+ tsc: 10001 }
+ - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
+ tsc: 10100 }
+...
+
+#CHECK: ---
+#CHECK-NEXT: header:
+#CHECK-NEXT: version: 1
+#CHECK-NEXT: type: 0
+#CHECK-NEXT: constant-tsc: true
+#CHECK-NEXT: nonstop-tsc: true
+#CHECK-NEXT: cycle-frequency: 2601000000
+#CHECK-NEXT: records:
+#CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter,
+#CHECK-NEXT: tsc: 10001 }
+#CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit,
+#CHECK-NEXT: tsc: 10100 }
+#CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-to-yaml.txt b/test/tools/llvm-xray/X86/convert-to-yaml.txt
new file mode 100644
index 0000000000000..c402bc18d83dc
--- /dev/null
+++ b/test/tools/llvm-xray/X86/convert-to-yaml.txt
@@ -0,0 +1,23 @@
+; RUN: llvm-xray convert %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s
+
+; CHECK: ---
+; CHECK-NEXT: header:
+; CHECK-NEXT: version: 1
+; CHECK-NEXT: type: 0
+; CHECK-NEXT: constant-tsc: true
+; CHECK-NEXT: nonstop-tsc: true
+; CHECK-NEXT: cycle-frequency: 2601000000
+; CHECK-NEXT: records:
+; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841453914 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841454542 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841454670 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841454762 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841454802 }
+; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841494828 }
+; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
new file mode 100644
index 0000000000000..ddb8b6bdb1cc8
--- /dev/null
+++ b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
@@ -0,0 +1,23 @@
+; RUN: llvm-xray convert -m %S/Inputs/elf64-sample-o2.bin -y %S/Inputs/naive-log-simple.xray -f=yaml -o - 2>&1 | FileCheck %s
+
+; CHECK: ---
+; CHECK-NEXT: header:
+; CHECK-NEXT: version: 1
+; CHECK-NEXT: type: 0
+; CHECK-NEXT: constant-tsc: true
+; CHECK-NEXT: nonstop-tsc: true
+; CHECK-NEXT: cycle-frequency: 2601000000
+; CHECK-NEXT: records:
+; CHECK-NEXT: - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841453914 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841454542 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841454670 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841454762 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841454802 }
+; CHECK-NEXT: - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841494828 }
+; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
new file mode 100644
index 0000000000000..71c17280df402
--- /dev/null
+++ b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
@@ -0,0 +1,23 @@
+; RUN: llvm-xray convert -m %S/Inputs/elf64-objcopied-instrmap.bin -y %S/Inputs/naive-log-simple.xray -f=yaml -o - 2>&1 | FileCheck %s
+
+; CHECK: ---
+; CHECK-NEXT: header:
+; CHECK-NEXT: version: 1
+; CHECK-NEXT: type: 0
+; CHECK-NEXT: constant-tsc: true
+; CHECK-NEXT: nonstop-tsc: true
+; CHECK-NEXT: cycle-frequency: 2601000000
+; CHECK-NEXT: records:
+; CHECK-NEXT: - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697,
+; CHECK-NEXT: kind: function-enter, tsc: 3315356841453914 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697,
+; CHECK-NEXT: kind: function-enter, tsc: 3315356841454542 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697,
+; CHECK-NEXT: kind: function-exit, tsc: 3315356841454670 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697,
+; CHECK-NEXT: kind: function-enter, tsc: 3315356841454762 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697,
+; CHECK-NEXT: kind: function-exit, tsc: 3315356841454802 }
+; CHECK-NEXT: - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697,
+; CHECK-NEXT: kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
new file mode 100644
index 0000000000000..01191c9c2a31e
--- /dev/null
+++ b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
@@ -0,0 +1,23 @@
+; RUN: llvm-xray convert -m %S/Inputs/simple-xray-instrmap.yaml -t yaml %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s
+
+; CHECK: ---
+; CHECK-NEXT: header:
+; CHECK-NEXT: version: 1
+; CHECK-NEXT: type: 0
+; CHECK-NEXT: constant-tsc: true
+; CHECK-NEXT: nonstop-tsc: true
+; CHECK-NEXT: cycle-frequency: 2601000000
+; CHECK-NEXT: records:
+; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841453914 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841454542 }
+; CHECK-NEXT: - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841454670 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter,
+; CHECK-NEXT: tsc: 3315356841454762 }
+; CHECK-NEXT: - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841454802 }
+; CHECK-NEXT: - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit,
+; CHECK-NEXT: tsc: 3315356841494828 }
+; CHECK-NEXT: ...