Diffstat (limited to 'test'): 80 files changed, 5651 insertions, 1506 deletions
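Every hunk below follows the same testing pattern: a RUN line pipes the IR file through opt's -cost-model -analyze pass, and FileCheck matches the per-instruction cost estimates the pass prints. A minimal sketch of that pattern, distilled from the RUN/CHECK lines in this commit (the function body and the cost value of 1 are illustrative placeholders, not part of the change):

; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s
define <2 x i64> @example(<2 x i64> %a, <2 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'example':
; CHECK: Found an estimated cost of 1 for instruction: %r
  %r = add <2 x i64> %a, %b
  ret <2 x i64> %r
}

Subtarget-specific expectations (SSE2:, AVX2:, AVX512:, XOP:, ...) work the same way: each RUN line selects its prefixes via --check-prefix, so one file can encode a different expected cost per instruction set.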
diff --git a/test/Analysis/CostModel/AArch64/bswap.ll b/test/Analysis/CostModel/AArch64/bswap.ll
new file mode 100644
index 000000000000..a97127a631d8
--- /dev/null
+++ b/test/Analysis/CostModel/AArch64/bswap.ll
@@ -0,0 +1,70 @@
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+
+; Verify the cost of bswap instructions.
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare i64 @llvm.bswap.i64(i64)
+
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
+declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
+
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
+declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
+
+define i16 @bswap_i16(i16 %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_i16':
+; CHECK: Found an estimated cost of 1 for instruction: %bswap
+  %bswap = tail call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %bswap
+}
+
+define i32 @bswap_i32(i32 %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_i32':
+; CHECK: Found an estimated cost of 1 for instruction: %bswap
+  %bswap = tail call i32 @llvm.bswap.i32(i32 %a)
+  ret i32 %bswap
+}
+
+define i64 @bswap_i64(i64 %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_i64':
+; CHECK: Found an estimated cost of 1 for instruction: %bswap
+  %bswap = tail call i64 @llvm.bswap.i64(i64 %a)
+  ret i64 %bswap
+}
+
+define <2 x i32> @bswap_v2i32(<2 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_v2i32':
+; CHECK: Found an estimated cost of 8 for instruction: %bswap
+  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  ret <2 x i32> %bswap
+}
+
+define <4 x i16> @bswap_v4i16(<4 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_v4i16':
+; CHECK: Found an estimated cost of 22 for instruction: %bswap
+  %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %a)
+  ret <4 x i16> %bswap
+}
+
+define <2 x i64> @bswap_v2i64(<2 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_v2i64':
+; CHECK: Found an estimated cost of 8 for instruction: %bswap
+  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a)
+  ret <2 x i64> %bswap
+}
+
+define <4 x i32> @bswap_v4i32(<4 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_v4i32':
+; CHECK: Found an estimated cost of 22 for instruction: %bswap
+  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a)
+  ret <4 x i32> %bswap
+}
+
+define <8 x i16> @bswap_v8i16(<8 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'bswap_v8i16':
+; CHECK: Found an estimated cost of 50 for instruction: %bswap
+  %bswap = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a)
+  ret <8 x i16> %bswap
+}
diff --git a/test/Analysis/CostModel/AArch64/falkor.ll b/test/Analysis/CostModel/AArch64/falkor.ll
new file mode 100644
index 000000000000..e9563191f077
--- /dev/null
+++ b/test/Analysis/CostModel/AArch64/falkor.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -cost-model -analyze -mcpu=falkor | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: vectorInstrCost
+define void @vectorInstrCost() {
+
+  ; Vector extracts - extracting the first element should have a zero cost;
+  ; all other elements should have a cost of two.
+  ;
+  ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
+  ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
+  %t1 = extractelement <2 x i64> undef, i32 0
+  %t2 = extractelement <2 x i64> undef, i32 1
+
+  ; Vector inserts - inserting the first element should have a zero cost; all
+  ; other elements should have a cost of two.
+  ;
+  ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
+  ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
+  %t3 = insertelement <2 x i64> undef, i64 undef, i32 0
+  %t4 = insertelement <2 x i64> undef, i64 undef, i32 1
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/AArch64/gep.ll b/test/Analysis/CostModel/AArch64/gep.ll
index f3d83c133027..08bfc3d21238 100644
--- a/test/Analysis/CostModel/AArch64/gep.ll
+++ b/test/Analysis/CostModel/AArch64/gep.ll
@@ -1,9 +1,9 @@
-; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck %s
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
-define i8 @test1(i8* %p, i32 %i) {
+define i8 @test1(i8* %p) {
 ; CHECK-LABEL: test1
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 0
@@ -11,7 +11,7 @@ define i8 @test1(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test2(i16* %p, i32 %i) {
+define i16 @test2(i16* %p) {
 ; CHECK-LABEL: test2
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 0
@@ -19,7 +19,7 @@ define i16 @test2(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test3(i32* %p, i32 %i) {
+define i32 @test3(i32* %p) {
 ; CHECK-LABEL: test3
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 0
@@ -27,7 +27,7 @@ define i32 @test3(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test4(i64* %p, i32 %i) {
+define i64 @test4(i64* %p) {
 ; CHECK-LABEL: test4
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 0
@@ -35,7 +35,7 @@ define i64 @test4(i64* %p, i32 %i) {
   ret i64 %v
 }
 
-define i8 @test5(i8* %p, i32 %i) {
+define i8 @test5(i8* %p) {
 ; CHECK-LABEL: test5
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 1024
@@ -43,7 +43,7 @@ define i8 @test5(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test6(i16* %p, i32 %i) {
+define i16 @test6(i16* %p) {
 ; CHECK-LABEL: test6
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 1024
@@ -51,7 +51,7 @@ define i16 @test6(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test7(i32* %p, i32 %i) {
+define i32 @test7(i32* %p) {
 ; CHECK-LABEL: test7
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 1024
@@ -59,7 +59,7 @@ define i32 @test7(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test8(i64* %p, i32 %i) {
+define i64 @test8(i64* %p) {
 ; CHECK-LABEL: test8
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 1024
@@ -67,7 +67,7 @@ define i64 @test8(i64* %p, i32 %i) {
   ret i64 %v
 }
 
-define i8 @test9(i8* %p, i32 %i) {
+define i8 @test9(i8* %p) {
 ; CHECK-LABEL: test9
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 4096
@@ -75,7 +75,7 @@ define i8 @test9(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test10(i16* %p, i32 %i) {
+define i16 @test10(i16* %p) {
 ; CHECK-LABEL: test10
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 4096
@@ -83,7 +83,7 @@ define i16 @test10(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test11(i32* %p, i32 %i) {
+define i32 @test11(i32* %p) {
 ; CHECK-LABEL: test11
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 4096
@@ -91,7 +91,7 @@ define i32 @test11(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test12(i64* %p, i32 %i) {
+define i64 @test12(i64* %p) {
 ; CHECK-LABEL: test12
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 4096
@@ -99,7 +99,7 @@ define i64 @test12(i64* %p, i32 %i) {
   ret i64 %v
 }
 
-define i8 @test13(i8* %p, i32 %i) {
+define i8 @test13(i8* %p) {
 ; CHECK-LABEL: test13
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 -64
@@ -107,7 +107,7 @@ define i8 @test13(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test14(i16* %p, i32 %i) {
+define i16 @test14(i16* %p) {
 ; CHECK-LABEL: test14
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 -64
@@ -115,7 +115,7 @@ define i16 @test14(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test15(i32* %p, i32 %i) {
+define i32 @test15(i32* %p) {
 ; CHECK-LABEL: test15
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 -64
@@ -123,7 +123,7 @@ define i32 @test15(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test16(i64* %p, i32 %i) {
+define i64 @test16(i64* %p) {
 ; CHECK-LABEL: test16
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 -64
@@ -131,7 +131,7 @@ define i64 @test16(i64* %p, i32 %i) {
   ret i64 %v
 }
 
-define i8 @test17(i8* %p, i32 %i) {
+define i8 @test17(i8* %p) {
 ; CHECK-LABEL: test17
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 -1024
@@ -139,7 +139,7 @@ define i8 @test17(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test18(i16* %p, i32 %i) {
+define i16 @test18(i16* %p) {
 ; CHECK-LABEL: test18
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 -1024
@@ -147,7 +147,7 @@ define i16 @test18(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test19(i32* %p, i32 %i) {
+define i32 @test19(i32* %p) {
 ; CHECK-LABEL: test19
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 -1024
@@ -155,7 +155,7 @@ define i32 @test19(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test20(i64* %p, i32 %i) {
+define i64 @test20(i64* %p) {
 ; CHECK-LABEL: test20
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 -1024
@@ -195,7 +195,7 @@ define i64 @test24(i64* %p, i32 %i) {
   ret i64 %v
 }
 
-define i8 @test25(i8* %p, i32 %i) {
+define i8 @test25(i8* %p) {
 ; CHECK-LABEL: test25
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 -128
@@ -203,7 +203,7 @@ define i8 @test25(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test26(i16* %p, i32 %i) {
+define i16 @test26(i16* %p) {
 ; CHECK-LABEL: test26
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 -128
@@ -211,7 +211,7 @@ define i16 @test26(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test27(i32* %p, i32 %i) {
+define i32 @test27(i32* %p) {
 ; CHECK-LABEL: test27
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 -128
@@ -219,7 +219,7 @@ define i32 @test27(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test28(i64* %p, i32 %i) {
+define i64 @test28(i64* %p) {
 ; CHECK-LABEL: test28
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 -128
@@ -227,7 +227,7 @@ define i64 @test28(i64* %p, i32 %i) {
   ret i64 %v
 }
 
-define i8 @test29(i8* %p, i32 %i) {
+define i8 @test29(i8* %p) {
 ; CHECK-LABEL: test29
 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 -256
@@ -235,7 +235,7 @@ define i8 @test29(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test30(i16* %p, i32 %i) {
+define i16 @test30(i16* %p) {
 ; CHECK-LABEL: test30
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 -256
@@ -243,7 +243,7 @@ define i16 @test30(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test31(i32* %p, i32 %i) {
+define i32 @test31(i32* %p) {
 ; CHECK-LABEL: test31
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 -256
@@ -251,7 +251,7 @@ define i32 @test31(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test32(i64* %p, i32 %i) {
+define i64 @test32(i64* %p) {
 ; CHECK-LABEL: test32
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 -256
@@ -259,7 +259,7 @@ define i64 @test32(i64* %p, i32 %i) {
   ret i64 %v
 }
 
-define i8 @test33(i8* %p, i32 %i) {
+define i8 @test33(i8* %p) {
 ; CHECK-LABEL: test33
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8*
   %a = getelementptr inbounds i8, i8* %p, i32 -512
@@ -267,7 +267,7 @@ define i8 @test33(i8* %p, i32 %i) {
   ret i8 %v
 }
 
-define i16 @test34(i16* %p, i32 %i) {
+define i16 @test34(i16* %p) {
 ; CHECK-LABEL: test34
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16*
   %a = getelementptr inbounds i16, i16* %p, i32 -512
@@ -275,7 +275,7 @@ define i16 @test34(i16* %p, i32 %i) {
   ret i16 %v
 }
 
-define i32 @test35(i32* %p, i32 %i) {
+define i32 @test35(i32* %p) {
 ; CHECK-LABEL: test35
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32*
   %a = getelementptr inbounds i32, i32* %p, i32 -512
@@ -283,7 +283,7 @@ define i32 @test35(i32* %p, i32 %i) {
   ret i32 %v
 }
 
-define i64 @test36(i64* %p, i32 %i) {
+define i64 @test36(i64* %p) {
 ; CHECK-LABEL: test36
 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64*
   %a = getelementptr inbounds i64, i64* %p, i32 -512
diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll
index 7319efb413d6..b7a615f55cde 100644
--- a/test/Analysis/CostModel/X86/arith.ll
+++ b/test/Analysis/CostModel/X86/arith.ll
@@ -436,7 +436,7 @@ define i32 @mul(i32 %arg) {
   %A = mul <2 x i64> undef, undef
   ; SSSE3: cost of 16 {{.*}} %B = mul
   ; SSE42: cost of 16 {{.*}} %B = mul
-  ; AVX: cost of 16 {{.*}} %B = mul
+  ; AVX: cost of 18 {{.*}} %B = mul
   ; AVX2: cost of 8 {{.*}} %B = mul
   ; AVX512F: cost of 8 {{.*}} %B = mul
   ; AVX512BW: cost of 8 {{.*}} %B = mul
@@ -444,7 +444,7 @@ define i32 @mul(i32 %arg) {
   %B = mul <4 x i64> undef, undef
   ; SSSE3: cost of 32 {{.*}} %C = mul
   ; SSE42: cost of 32 {{.*}} %C = mul
-  ; AVX: cost of 32 {{.*}} %C = mul
+  ; AVX: cost of 36 {{.*}} %C = mul
   ; AVX2: cost of 16 {{.*}} %C = mul
   ; AVX512F: cost of 8 {{.*}} %C = mul
   ; AVX512BW: cost of 8 {{.*}} %C = mul
diff --git a/test/Analysis/CostModel/X86/shuffle-broadcast.ll b/test/Analysis/CostModel/X86/shuffle-broadcast.ll
index a829a47f89f2..86cf7569a728 100644
--- a/test/Analysis/CostModel/X86/shuffle-broadcast.ll
+++ b/test/Analysis/CostModel/X86/shuffle-broadcast.ll
@@ -18,14 +18,150 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
   %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
 
   ; SSE: cost of 1 {{.*}} %V256 = shufflevector
-  ; AVX: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
   ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
 
   ; SSE: cost of 1 {{.*}} %V512 = shufflevector
-  ; AVX: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
   ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
 
   ret void
 }
+
+; CHECK-LABEL: 'test_vXi64'
+define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXf32'
+define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi32'
+define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi16'
+define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
+  ; SSE2: cost of 2 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi8'
+define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
+  ; SSE2: cost of 3 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/X86/vdiv-cost.ll b/test/Analysis/CostModel/X86/vdiv-cost.ll
index c8e4557cbefd..a45bb4b3d0d5 100644
--- a/test/Analysis/CostModel/X86/vdiv-cost.ll
+++ b/test/Analysis/CostModel/X86/vdiv-cost.ll
@@ -1,13 +1,20 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ
 
 define <4 x i32> @test1(<4 x i32> %a) {
   %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
   ret <4 x i32> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test1':
-; SSE2: Found an estimated cost of 15 for instruction: %div
-; AVX2: Found an estimated cost of 15 for instruction: %div
+; SSE: Found an estimated cost of 15 for instruction: %div
+; AVX: Found an estimated cost of 15 for instruction: %div
+; AVX512: Found an estimated cost of 15 for instruction: %div
 }
 
 define <8 x i32> @test2(<8 x i32> %a) {
@@ -15,8 +22,10 @@ define <8 x i32> @test2(<8 x i32> %a) {
   ret <8 x i32> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test2':
-; SSE2: Found an estimated cost of 30 for instruction: %div
+; SSE: Found an estimated cost of 30 for instruction: %div
+; AVX1: Found an estimated cost of 30 for instruction: %div
 ; AVX2: Found an estimated cost of 15 for instruction: %div
+; AVX512: Found an estimated cost of 15 for instruction: %div
 }
 
 define <8 x i16> @test3(<8 x i16> %a) {
@@ -24,8 +33,9 @@ define <8 x i16> @test3(<8 x i16> %a) {
   ret <8 x i16> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test3':
-; SSE2: Found an estimated cost of 6 for instruction: %div
-; AVX2: Found an estimated cost of 6 for instruction: %div
+; SSE: Found an estimated cost of 6 for instruction: %div
+; AVX: Found an estimated cost of 6 for instruction: %div
+; AVX512: Found an estimated cost of 6 for instruction: %div
 }
 
 define <16 x i16> @test4(<16 x i16> %a) {
@@ -33,8 +43,10 @@ define <16 x i16> @test4(<16 x i16> %a) {
   ret <16 x i16> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test4':
-; SSE2: Found an estimated cost of 12 for instruction: %div
+; SSE: Found an estimated cost of 12 for instruction: %div
+; AVX1: Found an estimated cost of 12 for instruction: %div
 ; AVX2: Found an estimated cost of 6 for instruction: %div
+; AVX512: Found an estimated cost of 6 for instruction: %div
 }
 
 define <8 x i16> @test5(<8 x i16> %a) {
@@ -42,8 +54,9 @@ define <8 x i16> @test5(<8 x i16> %a) {
   ret <8 x i16> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test5':
-; SSE2: Found an estimated cost of 6 for instruction: %div
-; AVX2: Found an estimated cost of 6 for instruction: %div
+; SSE: Found an estimated cost of 6 for instruction: %div
+; AVX: Found an estimated cost of 6 for instruction: %div
+; AVX512: Found an estimated cost of 6 for instruction: %div
 }
 
 define <16 x i16> @test6(<16 x i16> %a) {
@@ -51,8 +64,10 @@ define <16 x i16> @test6(<16 x i16> %a) {
   ret <16 x i16> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test6':
-; SSE2: Found an estimated cost of 12 for instruction: %div
+; SSE: Found an estimated cost of 12 for instruction: %div
+; AVX1: Found an estimated cost of 12 for instruction: %div
 ; AVX2: Found an estimated cost of 6 for instruction: %div
+; AVX512: Found an estimated cost of 6 for instruction: %div
 }
 
 define <16 x i8> @test7(<16 x i8> %a) {
@@ -60,8 +75,9 @@ define <16 x i8> @test7(<16 x i8> %a) {
   ret <16 x i8> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test7':
-; SSE2: Found an estimated cost of 320 for instruction: %div
-; AVX2: Found an estimated cost of 320 for instruction: %div
+; SSE: Found an estimated cost of 320 for instruction: %div
+; AVX: Found an estimated cost of 320 for instruction: %div
+; AVX512: Found an estimated cost of 320 for instruction: %div
 }
 
 define <4 x i32> @test8(<4 x i32> %a) {
@@ -69,8 +85,9 @@ define <4 x i32> @test8(<4 x i32> %a) {
   ret <4 x i32> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test8':
-; SSE2: Found an estimated cost of 19 for instruction: %div
-; AVX2: Found an estimated cost of 15 for instruction: %div
+; SSE: Found an estimated cost of 19 for instruction: %div
+; AVX: Found an estimated cost of 15 for instruction: %div
+; AVX512: Found an estimated cost of 15 for instruction: %div
 }
 
 define <8 x i32> @test9(<8 x i32> %a) {
@@ -78,8 +95,10 @@ define <8 x i32> @test9(<8 x i32> %a) {
   ret <8 x i32> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test9':
-; SSE2: Found an estimated cost of 38 for instruction: %div
+; SSE: Found an estimated cost of 38 for instruction: %div
+; AVX1: Found an estimated cost of 38 for instruction: %div
 ; AVX2: Found an estimated cost of 15 for instruction: %div
+; AVX512: Found an estimated cost of 15 for instruction: %div
 }
 
 define <8 x i32> @test10(<8 x i32> %a) {
@@ -87,6 +106,17 @@ define <8 x i32> @test10(<8 x i32> %a) {
   ret <8 x i32> %div
 
 ; CHECK: 'Cost Model Analysis' for function 'test10':
-; SSE2: Found an estimated cost of 160 for instruction: %div
-; AVX2: Found an estimated cost of 160 for instruction: %div
+; SSE: Found an estimated cost of 160 for instruction: %div
+; AVX: Found an estimated cost of 160 for instruction: %div
+; AVX512: Found an estimated cost of 160 for instruction: %div
+}
+
+define <16 x i32> @test11(<16 x i32> %a) {
+  %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+  ret <16 x i32> %div
+
+; CHECK: 'Cost Model Analysis' for function 'test11':
+; SSE: Found an estimated cost of 320 for instruction: %div
+; AVX: Found an estimated cost of 320 for instruction: %div
+; AVX512: Found an estimated cost of 320 for instruction: %div
 }
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index e53e40b57e1d..888164df75f5 100644
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -1,9 +1,12 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK -check-prefix=XOP --check-prefix=XOPAVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK -check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
 
 ; Verify the cost of vector arithmetic shift right instructions.
@@ -17,6 +20,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41: Found an estimated cost of 12 for instruction: %shift
 ; AVX: Found an estimated cost of 12 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <2 x i64> %a, %b
   ret <2 x i64> %shift
@@ -28,17 +32,31 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <4 x i64> %a, %b
   ret <4 x i64> %shift
 }
 
+define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64':
+; SSE2: Found an estimated cost of 48 for instruction: %shift
+; SSE41: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX2: Found an estimated cost of 8 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <8 x i64> %a, %b
+  ret <8 x i64> %shift
+}
+
 define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32':
 ; SSE2: Found an estimated cost of 16 for instruction: %shift
 ; SSE41: Found an estimated cost of 12 for instruction: %shift
 ; AVX: Found an estimated cost of 12 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 2 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = ashr <4 x i32> %a, %b
@@ -51,18 +69,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 4 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = ashr <8 x i32> %a, %b
   ret <8 x i32> %shift
 }
 
+define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32':
+; SSE2: Found an estimated cost of 64 for instruction: %shift
+; SSE41: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
+  %shift = ashr <16 x i32> %a, %b
+  ret <16 x i32> %shift
+}
+
 define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16':
 ; SSE2: Found an estimated cost of 32 for instruction: %shift
 ; SSE41: Found an estimated cost of 14 for instruction: %shift
 ; AVX: Found an estimated cost of 14 for instruction: %shift
 ; AVX2: Found an estimated cost of 14 for instruction: %shift
+; AVX512: Found an estimated cost of 14 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <8 x i16> %a, %b
   ret <8 x i16> %shift
@@ -74,17 +107,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE41: Found an estimated cost of 28 for instruction: %shift
 ; AVX: Found an estimated cost of 28 for instruction: %shift
 ; AVX2: Found an estimated cost of 10 for instruction: %shift
+; AVX512: Found an estimated cost of 10 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <16 x i16> %a, %b
   ret <16 x i16> %shift
 }
 
+define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16':
+; SSE2: Found an estimated cost of 128 for instruction: %shift
+; SSE41: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX2: Found an estimated cost of 20 for instruction: %shift
+; AVX512F: Found an estimated cost of 20 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <32 x i16> %a, %b
+  ret <32 x i16> %shift
+}
+
 define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8':
 ; SSE2: Found an estimated cost of 54 for instruction: %shift
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 24 for instruction: %shift
+; AVX512: Found an estimated cost of 24 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <16 x i8> %a, %b
   ret <16 x i8> %shift
@@ -96,11 +144,26 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; SSE41: Found an estimated cost of 48 for instruction: %shift
 ; AVX: Found an estimated cost of 48 for instruction: %shift
 ; AVX2: Found an estimated cost of 24 for instruction: %shift
+; AVX512F: Found an estimated cost of 24 for instruction: %shift
+; AVX512BW: Found an estimated cost of 24 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <32 x i8> %a, %b
   ret <32 x i8> %shift
 }
 
+define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8':
+; SSE2: Found an estimated cost of 216 for instruction: %shift
+; SSE41: Found an estimated cost of 96 for instruction: %shift
+; AVX: Found an estimated cost of 96 for instruction: %shift
+; AVX2: Found an estimated cost of 48 for instruction: %shift
+; AVX512F: Found an estimated cost of 48 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <64 x i8> %a, %b
+  ret <64 x i8> %shift
+}
+
 ;
 ; Uniform Variable Shifts
 ;
@@ -111,6 +174,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41: Found an estimated cost of 12 for instruction: %shift
 ; AVX: Found an estimated cost of 12 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
   %shift = ashr <2 x i64> %a, %splat
@@ -123,18 +187,33 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
   %shift = ashr <4 x i64> %a, %splat
   ret <4 x i64> %shift
 }
 
+define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64':
+; SSE2: Found an estimated cost of 48 for instruction: %shift
+; SSE41: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX2: Found an estimated cost of 8 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+  %shift = ashr <8 x i64> %a, %splat
+  ret <8 x i64> %shift
+}
+
 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32':
 ; SSE2: Found an estimated cost of 16 for instruction: %shift
 ; SSE41: Found an estimated cost of 12 for instruction: %shift
 ; AVX: Found an estimated cost of 12 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 2 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -148,6 +227,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 4 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -155,12 +235,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
   ret <8 x i32> %shift
 }
 
+define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32':
+; SSE2: Found an estimated cost of 64 for instruction: %shift
+; SSE41: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
+  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
+  %shift = ashr <16 x i32> %a, %splat
+  ret <16 x i32> %shift
+}
+
 define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16':
 ; SSE2: Found an estimated cost of 32 for instruction: %shift
 ; SSE41: Found an estimated cost of 14 for instruction: %shift
 ; AVX: Found an estimated cost of 14 for instruction: %shift
 ; AVX2: Found an estimated cost of 14 for instruction: %shift
+; AVX512: Found an estimated cost of 14 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
   %shift = ashr <8 x i16> %a, %splat
@@ -173,18 +268,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE41: Found an estimated cost of 28 for instruction: %shift
 ; AVX: Found an estimated cost of 28 for instruction: %shift
 ; AVX2: Found an estimated cost of 10 for instruction: %shift
+; AVX512: Found an estimated cost of 10 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
   %shift = ashr <16 x i16> %a, %splat
   ret <16 x i16> %shift
 }
 
+define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16':
+; SSE2: Found an estimated cost of 128 for instruction: %shift
+; SSE41: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX2: Found an estimated cost of 20 for instruction: %shift
+; AVX512F: Found an estimated cost of 20 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
+  %shift = ashr <32 x i16> %a, %splat
+  ret <32 x i16> %shift
+}
+
 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8':
 ; SSE2: Found an estimated cost of 54 for instruction: %shift
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 24 for instruction: %shift
+; AVX512: Found an estimated cost of 24 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
   %shift = ashr <16 x i8> %a, %splat
@@ -197,12 +308,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; SSE41: Found an estimated cost of 48 for instruction: %shift
 ; AVX: Found an estimated cost of 48 for instruction: %shift
 ; AVX2: Found an estimated cost of 24 for instruction: %shift
+; AVX512: Found an estimated cost of 24 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
   %shift = ashr <32 x i8> %a, %splat
   ret <32 x i8> %shift
 }
 
+define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8':
+; SSE2: Found an estimated cost of 216 for instruction: %shift
+; SSE41: Found an estimated cost of 96 for instruction: %shift
+; AVX: Found an estimated cost of 96 for instruction: %shift
+; AVX2: Found an estimated cost of 48 for instruction: %shift
+; AVX512F: Found an estimated cost of 48 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
+  %shift = ashr <64 x i8> %a, %splat
+  ret <64 x i8> %shift
+}
+
 ;
 ; Constant Shifts
 ;
@@ -213,6 +339,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) {
 ; SSE41: Found an estimated cost of 12 for instruction: %shift
 ; AVX: Found an estimated cost of 12 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <2 x i64> %a, <i64 1, i64 7>
   ret <2 x i64> %shift
@@ -224,17 +351,31 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
   ret <4 x i64> %shift
 }
 
+define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64':
+; SSE2: Found an estimated cost of 48 for instruction: %shift
+; SSE41: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX2: Found an estimated cost of 8 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
+  ret <8 x i64> %shift
+}
+
 define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32':
 ; SSE2: Found an estimated cost of 16 for instruction: %shift
 ; SSE41: Found an estimated cost of 12 for instruction: %shift
 ; AVX: Found an estimated cost of 12 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 2 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
@@ -247,18 +388,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 4 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x i32> %shift
 }
 
+define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32':
+; SSE2: Found an estimated cost of 64 for instruction: %shift
+; SSE41: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
+  %shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x i32> %shift
+}
+
 define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16':
 ; SSE2: Found an estimated cost of 32 for instruction: %shift
 ; SSE41: Found an estimated cost of 14 for instruction: %shift
 ; AVX: Found an estimated cost of 14 for instruction: %shift
 ; AVX2: Found an estimated cost of 14 for instruction: %shift
+; AVX512: Found an estimated cost of 14 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   ret <8 x i16> %shift
@@ -270,17 +426,32 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; SSE41: Found an estimated cost of 28 for instruction: %shift
 ; AVX: Found an estimated cost of 28 for instruction: %shift
 ; AVX2: Found an estimated cost of 10 for instruction: %shift
+; AVX512: Found an estimated cost of 10 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
   ret <16 x i16> %shift
 }
 
+define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16':
+; SSE2: Found an estimated cost of 128 for instruction: %shift
+; SSE41: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX2: Found an estimated cost of 20 for instruction: %shift
+; AVX512F: Found an estimated cost of 20 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+  ret <32 x i16> %shift
+}
+
 define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8':
 ; SSE2: Found an estimated cost of 54 for instruction: %shift
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
 ; AVX: Found an estimated cost of 24 for instruction: %shift
 ; AVX2: Found an estimated cost of 24 for instruction: %shift
+; AVX512: Found an estimated cost of 24 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <16 x i8> %shift
@@ -292,11 +463,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; SSE41: Found an estimated cost of 48 for instruction: %shift
 ; AVX: Found an estimated cost of 48 for instruction: %shift
 ; AVX2: Found an estimated cost of 24 for instruction: %shift
+; AVX512: Found an estimated cost of 24 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <32 x i8> %shift
 }
 
+define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8':
+; SSE2: Found an estimated cost of 216 for instruction: %shift
+; SSE41: Found an estimated cost of 96 for instruction: %shift
+; AVX: Found an estimated cost of 96 for instruction: %shift
+; AVX2: Found an estimated cost of 48 for instruction: %shift
+; AVX512F: Found an estimated cost of 48 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
+  ret <64 x i8> %shift
+}
+
 ;
 ; Uniform Constant Shifts
 ;
@@ -307,6 +492,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
 ; SSE41: Found an estimated cost of 4 for instruction: %shift
 ; AVX: Found an estimated cost of 4 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <2 x i64> %a, <i64 7, i64 7>
   ret <2 x i64> %shift
@@ -318,17 +504,31 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
 ; SSE41: Found an estimated cost of 8 for instruction: %shift
 ; AVX: Found an estimated cost of 8 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
   ret <4 x i64> %shift
 }
 
+define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64':
+; SSE2: Found an estimated cost of 16 for instruction: %shift
+; SSE41: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX2: Found an estimated cost of 8 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+  ret <8 x i64> %shift
+}
+
 define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32':
 ; SSE2: Found an estimated cost of 1 for instruction: %shift
 ; SSE41: Found an estimated cost of 1 for instruction: %shift
 ; AVX: Found an estimated cost of 1 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 2 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
@@ -341,18 +541,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
 ; SSE41: Found an estimated cost of 2 for instruction: %shift
 ; AVX: Found an estimated cost of 2 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 4 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   ret <8 x i32> %shift
 }
 
+define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32':
+; SSE2: Found an estimated cost of 4 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
+  %shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i32> %shift
+}
+
 define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16':
 ; SSE2: Found an estimated cost of 1 for instruction: %shift
 ; SSE41: Found an estimated cost of 1 for instruction: %shift
 ; AVX: Found an estimated cost of 1 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   ret <8 x i16> %shift
@@ -364,17 +579,32 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; SSE41: Found an estimated cost of 2 for instruction: %shift
 ; AVX: Found an estimated cost of 2 for instruction: %shift
 ; AVX2: Found an estimated cost of 10 for instruction: %shift
+; AVX512: Found an estimated cost of 10 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
   ret <16 x i16> %shift
 }
 
+define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16':
+; SSE2: Found an estimated cost of 4 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 20 for instruction: %shift
+; AVX512F: Found an estimated cost of 20 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <32 x i16> %shift
+}
+
 define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8':
 ; SSE2: Found an estimated cost of 4 for instruction: %shift
 ; SSE41: Found an estimated cost of 4 for instruction: %shift
 ; AVX: Found an estimated cost of 4 for instruction: %shift
 ; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
 ; XOP: Found an estimated cost of 2 for instruction: %shift
   %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <16 x i8> %shift
@@ -386,7 +616,21 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; SSE41: Found an estimated cost of 8 for instruction: %shift
 ; AVX: Found an estimated cost of 8 for instruction: %shift
 ; AVX2: Found an estimated cost of 24 for instruction: %shift
+; AVX512: Found an estimated cost of 24 for instruction: %shift
 ; XOP: Found an estimated cost of 4 for instruction: %shift
   %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   ret <32 x i8> %shift
 }
+
+define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8':
+; SSE2: Found an estimated cost of 16 for instruction: %shift
+; SSE41: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX2: Found an estimated cost of 48 for instruction: %shift
+; AVX512F: Found an estimated cost of 48 for instruction: %shift
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
+  %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <64 x i8> %shift
+}
diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
index 6d028268ea55..b3382253739f 100644
--- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -1,9 +1,12 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
 
 ; Verify the cost of vector logical shift right instructions.
@@ -17,6 +20,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41: Found an estimated cost of 4 for instruction: %shift
 ; AVX: Found an estimated cost of 4 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 2 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = lshr <2 x i64> %a, %b
@@ -29,18 +33,33 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41: Found an estimated cost of 8 for instruction: %shift
 ; AVX: Found an estimated cost of 8 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 4 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = lshr <4 x i64> %a, %b
   ret <4 x i64> %shift
 }
 
+define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64':
+; SSE2: Found an estimated cost of 16 for instruction: %shift
+; SSE41: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
+  %shift = lshr <8 x i64> %a, %b
+  ret <8 x i64> %shift
+}
+
 define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32':
 ; SSE2: Found an estimated cost of 16 for instruction: %shift
 ; SSE41: Found an estimated cost of 11 for instruction: %shift
 ; AVX: Found an estimated cost of 11 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 2 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = lshr <4 x i32> %a, %b
@@ -53,18 +72,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE41: Found an estimated cost of 22 for instruction: %shift
 ; AVX: Found an estimated cost of 22 for instruction: %shift
 ; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
 ; XOPAVX: Found an estimated cost of 4 for instruction: %shift
 ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
   %shift = lshr <8 x i32> %a, %b
   ret <8 x i32> %shift
 }
 
+define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32':
+; SSE2: Found an estimated cost of 64 for instruction: %shift
+; SSE41: Found an estimated cost of 44
for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -76,17 +110,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -98,11 +147,25 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for 
instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -113,6 +176,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -126,6 +190,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -133,12 +198,27 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -152,6 +232,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -159,12 +240,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 
'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = lshr <16 x i32> %a, %splat + ret <16 x i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i16> %a, %splat @@ -177,18 +273,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = lshr <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i8> %a, %splat @@ -201,12 +313,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for 
instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = lshr <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -217,6 +344,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, <i64 1, i64 7> @@ -229,18 +357,33 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31> ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31> + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> @@ -253,18 +396,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for 
instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <8 x i16> %shift @@ -276,17 +434,32 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, <i8 0, i8 
1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <16 x i8> %shift @@ -298,11 +471,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -313,6 +500,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, <i64 7, i64 7> @@ -325,18 +513,33 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7> + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 
x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5> @@ -349,18 +552,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <8 x i16> %shift @@ -372,17 +590,32 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost 
of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <32 x i16> %shift +} + define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <16 x i8> %shift @@ -394,7 +627,21 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } + +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <64 x i8> %shift +} diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 60ba3adea42a..804c5a76c319 100644 --- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -1,9 +1,12 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -; RUN: opt < %s 
-mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW ; Verify the cost of vector shift left instructions. 
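; (Sketch, with assumptions: opt and FileCheck on PATH, and this file's own path
; substituted for lit's %s. Any one cost table above can be re-checked in
; isolation by running a single RUN configuration by hand, e.g. the AVX2 one:
;   opt < vshift-shl-cost.ll -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck vshift-shl-cost.ll --check-prefix=CHECK --check-prefix=AVX2
; Pinning features with -mattr rather than a whole CPU with -mcpu ties each
; FileCheck prefix to exactly one ISA level; that is what lets AVX512F and
; AVX512BW diverge on the <32 x i16> and <64 x i8> tests below, where AVX512BW's
; 512-bit word and byte operations bring the expected cost down to 2.)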
@@ -18,6 +21,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, %b @@ -30,18 +34,33 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift } +define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, %b + ret <8 x i64> %shift +} + define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift ; SSE41: Found an estimated cost of 10 for instruction: %shift ; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, %b @@ -54,18 +73,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 20 for instruction: %shift ; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, %b ret <8 x i32> %shift } +define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': +; SSE2: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 40 for instruction: %shift +; AVX: Found an estimated cost of 40 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for 
instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -77,17 +111,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -99,11 +148,25 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -114,6 +177,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: 
Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -127,6 +191,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -134,12 +199,27 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift ; SSE41: Found an estimated cost of 10 for instruction: %shift ; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -153,6 +233,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 20 for instruction: %shift ; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -160,12 +241,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 40 for instruction: %shift +; AVX: Found an estimated cost of 40 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i32> %a, %splat + ret <16 x 
i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = shl <8 x i16> %a, %splat @@ -178,18 +274,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = shl <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = shl <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = shl <16 x i8> %a, %splat @@ -202,12 +314,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift 
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = shl <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -218,6 +345,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, <i64 1, i64 7> @@ -230,18 +358,33 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31> ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31> + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 6 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> @@ -254,18 +397,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 24 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: 
%shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <8 x i16> %shift @@ -277,18 +435,34 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <16 x i8> %shift @@ -300,11 +474,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 
x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -315,6 +503,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, <i64 7, i64 7> @@ -327,18 +516,33 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7> + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: 
%shift %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5> @@ -351,18 +555,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <8 x i16> %shift @@ -374,18 +593,34 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + ret <32 x i16> %shift +} + define <16 x 
i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <16 x i8> %shift @@ -397,11 +632,25 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <32 x i8> %shift } +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <64 x i8> %shift +} + ; ; Special Cases ; diff --git a/test/Bitcode/summary_version.ll b/test/Bitcode/summary_version.ll index dfb9e9b15e7b..81025a221bb1 100644 --- a/test/Bitcode/summary_version.ll +++ b/test/Bitcode/summary_version.ll @@ -2,7 +2,7 @@ ; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s ; CHECK: <GLOBALVAL_SUMMARY_BLOCK -; CHECK: <VERSION op0=2/> +; CHECK: <VERSION op0=3/> diff --git a/test/Bitcode/thinlto-function-summary.ll b/test/Bitcode/thinlto-function-summary.ll index e42c55c1c2eb..594aaab566d1 100644 --- a/test/Bitcode/thinlto-function-summary.ll +++ b/test/Bitcode/thinlto-function-summary.ll @@ -10,7 +10,7 @@ ; BC-NEXT: <PERMODULE {{.*}} op0=1 op1=0 ; BC-NEXT: <PERMODULE {{.*}} op0=2 op1=0 ; BC-NEXT: <PERMODULE {{.*}} op0=3 op1=7 -; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=32 +; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=16 ; BC-NEXT: <ALIAS {{.*}} op0=5 op1=0 op2=3 ; BC-NEXT: </GLOBALVAL_SUMMARY_BLOCK ; BC-NEXT: <VALUE_SYMTAB diff --git a/test/Bitcode/thinlto-summary-section.ll b/test/Bitcode/thinlto-summary-section.ll index d120622db819..3d67279617ec 100644 --- a/test/Bitcode/thinlto-summary-section.ll +++ b/test/Bitcode/thinlto-summary-section.ll @@ -4,8 +4,10 @@ ; RUN: llvm-lto -thinlto -o %t2 %t.o ; RUN: llvm-bcanalyzer -dump 
%t2.thinlto.bc | FileCheck %s --check-prefix=COMBINED -; CHECK: <PERMODULE {{.*}} op1=16 -; COMBINED-DAG: <COMBINED {{.*}} op2=16 -define void @functionWithSection() section "some_section" { +; Flags should be 0x17 (23) for local linkage (0x7) and not being importable +; (0x10) due to local linkage plus having a section. +; CHECK: <PERMODULE {{.*}} op1=23 +; COMBINED-DAG: <COMBINED {{.*}} op2=23 +define internal void @functionWithSection() section "some_section" { ret void } diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir index 22210e49bd77..ece5a858b49c 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir @@ -2836,13 +2836,13 @@ registers: # CHECK: body: # CHECK: %wzr = SUBSWrr %0, %0, implicit-def %nzcv -# CHECK: %1 = CSINCWr %wzr, %wzr, 0, implicit %nzcv +# CHECK: %1 = CSINCWr %wzr, %wzr, 1, implicit %nzcv # CHECK: %xzr = SUBSXrr %2, %2, implicit-def %nzcv -# CHECK: %3 = CSINCWr %wzr, %wzr, 2, implicit %nzcv +# CHECK: %3 = CSINCWr %wzr, %wzr, 3, implicit %nzcv # CHECK: %xzr = SUBSXrr %4, %4, implicit-def %nzcv -# CHECK: %5 = CSINCWr %wzr, %wzr, 1, implicit %nzcv +# CHECK: %5 = CSINCWr %wzr, %wzr, 0, implicit %nzcv body: | bb.0: diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index e023e32bb7b1..15b4012f383d 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -52,10 +52,10 @@ define void @allocai64() { ; CHECK: body: ; ; ABI/constant lowering and IR-level entry basic block. -; CHECK: {{bb.[0-9]+}}: +; CHECK: {{bb.[0-9]+}} (%ir-block.{{[0-9]+}}): ; ; Make sure we have one successor and only one. -; CHECK-NEXT: successors: %[[END:bb.[0-9]+]](0x80000000) +; CHECK-NEXT: successors: %[[END:bb.[0-9]+.end]](0x80000000) ; ; Check that we emit the correct branch. ; CHECK: G_BR %[[END]] @@ -74,10 +74,10 @@ end: ; CHECK: body: ; ; ABI/constant lowering and IR-level entry basic block. -; CHECK: {{bb.[0-9]+}}: +; CHECK: {{bb.[0-9]+}} (%ir-block.{{[0-9]+}}): ; Make sure we have two successors -; CHECK-NEXT: successors: %[[TRUE:bb.[0-9]+]](0x40000000), -; CHECK: %[[FALSE:bb.[0-9]+]](0x40000000) +; CHECK-NEXT: successors: %[[TRUE:bb.[0-9]+.true]](0x40000000), +; CHECK: %[[FALSE:bb.[0-9]+.false]](0x40000000) ; ; CHECK: [[ADDR:%.*]](p0) = COPY %x0 ; @@ -100,6 +100,74 @@ false: ret void } + +; Tests for switch. +; This gets lowered to a very straightforward sequence of comparisons for now.
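+; Illustrative sketch only, not checked by FileCheck (the %c100 and %next0
+; names are hypothetical): the translation behaves roughly as if the switch
+; had been rewritten into
+;   %c100 = icmp eq i32 100, %argc
+;   br i1 %c100, label %case100, label %next0
+; next0:
+;   %c200 = icmp eq i32 200, %argc
+;   br i1 %c200, label %case200, label %default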
+; CHECK-LABEL: name: switch +; CHECK: body: +; +; CHECK: {{bb.[0-9]+.entry}}: +; CHECK-NEXT: successors: %[[BB_CASE100:bb.[0-9]+.case100]](0x40000000), %[[BB_NOTCASE100_CHECKNEXT:bb.[0-9]+.entry]](0x40000000) +; CHECK: %0(s32) = COPY %w0 +; CHECK: %[[reg100:[0-9]+]](s32) = G_CONSTANT i32 100 +; CHECK: %[[reg200:[0-9]+]](s32) = G_CONSTANT i32 200 +; CHECK: %[[reg0:[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: %[[reg1:[0-9]+]](s32) = G_CONSTANT i32 1 +; CHECK: %[[reg2:[0-9]+]](s32) = G_CONSTANT i32 2 +; CHECK: %[[regicmp100:[0-9]+]](s1) = G_ICMP intpred(eq), %[[reg100]](s32), %0 +; CHECK: G_BRCOND %[[regicmp100]](s1), %[[BB_CASE100]] +; CHECK: G_BR %[[BB_NOTCASE100_CHECKNEXT]] +; +; CHECK: [[BB_CASE100]]: +; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000) +; CHECK: %[[regretc100:[0-9]+]](s32) = G_ADD %0, %[[reg1]] +; CHECK: G_BR %[[BB_RET]] +; CHECK: [[BB_NOTCASE100_CHECKNEXT]]: +; CHECK-NEXT: successors: %[[BB_CASE200:bb.[0-9]+.case200]](0x40000000), %[[BB_NOTCASE200_CHECKNEXT:bb.[0-9]+.entry]](0x40000000) +; CHECK: %[[regicmp200:[0-9]+]](s1) = G_ICMP intpred(eq), %[[reg200]](s32), %0 +; CHECK: G_BRCOND %[[regicmp200]](s1), %[[BB_CASE200]] +; CHECK: G_BR %[[BB_NOTCASE200_CHECKNEXT]] +; +; CHECK: [[BB_CASE200]]: +; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000) +; CHECK: %[[regretc200:[0-9]+]](s32) = G_ADD %0, %[[reg2]] +; CHECK: G_BR %[[BB_RET]] +; CHECK: [[BB_NOTCASE200_CHECKNEXT]]: +; CHECK-NEXT: successors: %[[BB_DEFAULT:bb.[0-9]+.default]](0x80000000) +; CHECK: G_BR %[[BB_DEFAULT]] +; +; CHECK: [[BB_DEFAULT]]: +; CHECK-NEXT: successors: %[[BB_RET]](0x80000000) +; CHECK: %[[regretdefault:[0-9]+]](s32) = G_ADD %0, %[[reg0]] +; CHECK: G_BR %[[BB_RET]] +; +; CHECK: [[BB_RET]]: +; CHECK-NEXT: %[[regret:[0-9]+]](s32) = PHI %[[regretdefault]](s32), %[[BB_DEFAULT]], %[[regretc100]](s32), %[[BB_CASE100]] +; CHECK: %w0 = COPY %[[regret]](s32) +; CHECK: RET_ReallyLR implicit %w0 +define i32 @switch(i32 %argc) { +entry: + switch i32 %argc, label %default [ + i32 100, label %case100 + i32 200, label %case200 + ] + +default: + %tmp0 = add i32 %argc, 0 + br label %return + +case100: + %tmp1 = add i32 %argc, 1 + br label %return + +case200: + %tmp2 = add i32 %argc, 2 + br label %return + +return: + %res = phi i32 [ %tmp0, %default ], [ %tmp1, %case100 ], [ %tmp2, %case200 ] + ret i32 %res +} + ; Tests for or. ; CHECK-LABEL: name: ori64 ; CHECK: [[ARG1:%[0-9]+]](s64) = COPY %x0 @@ -223,11 +291,11 @@ define i64* @trivial_bitcast(i8* %a) { ; CHECK-LABEL: name: trivial_bitcast_with_copy ; CHECK: [[A:%[0-9]+]](p0) = COPY %x0 -; CHECK: G_BR %[[CAST:bb\.[0-9]+]] +; CHECK: G_BR %[[CAST:bb\.[0-9]+.cast]] ; CHECK: [[CAST]]: ; CHECK: {{%[0-9]+}}(p0) = COPY [[A]] -; CHECK: G_BR %[[END:bb\.[0-9]+]] +; CHECK: G_BR %[[END:bb\.[0-9]+.end]] ; CHECK: [[END]]: define i64* @trivial_bitcast_with_copy(i8* %a) { @@ -324,8 +392,8 @@ define void @intrinsics(i32 %cur, i32 %bits) { } ; CHECK-LABEL: name: test_phi -; CHECK: G_BRCOND {{%.*}}, %[[TRUE:bb\.[0-9]+]] -; CHECK: G_BR %[[FALSE:bb\.[0-9]+]] +; CHECK: G_BRCOND {{%.*}}, %[[TRUE:bb\.[0-9]+.true]] +; CHECK: G_BR %[[FALSE:bb\.[0-9]+.false]] ; CHECK: [[TRUE]]: ; CHECK: [[RES1:%[0-9]+]](s32) = G_LOAD @@ -933,7 +1001,7 @@ define void @test_large_const(i128* %addr) { ; correct. 
define i8* @test_const_placement() { ; CHECK-LABEL: name: test_const_placement -; CHECK: bb.{{[0-9]+}}: +; CHECK: bb.{{[0-9]+}} (%ir-block.{{[0-9]+}}): ; CHECK: [[VAL_INT:%[0-9]+]](s32) = G_CONSTANT i32 42 ; CHECK: [[VAL:%[0-9]+]](p0) = G_INTTOPTR [[VAL_INT]](s32) ; CHECK: G_BR diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll index 9051b2388fce..718364af2aca 100644 --- a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll +++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll @@ -8,8 +8,8 @@ declare i32 @llvm.eh.typeid.for(i8*) ; CHECK: name: bar ; CHECK: body: -; CHECK-NEXT: bb.1: -; CHECK: successors: %[[GOOD:bb.[0-9]+]]{{.*}}%[[BAD:bb.[0-9]+]] +; CHECK-NEXT: bb.1 (%ir-block.0): +; CHECK: successors: %[[GOOD:bb.[0-9]+.continue]]{{.*}}%[[BAD:bb.[0-9]+.broken]] ; CHECK: EH_LABEL ; CHECK: %w0 = COPY ; CHECK: BL @foo, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %w0, implicit-def %w0 diff --git a/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll index 4a3696501fd8..727c189721fa 100644 --- a/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64-apple-ios -O3 -aarch64-enable-collect-loh | FileCheck %s ; Check that the LOH analysis does not crash when the analysed chain ; contains instructions that are filtered out. ; diff --git a/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/test/CodeGen/AArch64/arm64-collect-loh-str.ll index e3df4182ddca..773286ef1d72 100644 --- a/test/CodeGen/AArch64/arm64-collect-loh-str.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh-str.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s ; Test case for <rdar://problem/15942912>. ; AdrpAddStr cannot be used when the store uses the same ; register as address and value. Indeed, the related diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll index b697b6eced3d..c7ba989d933e 100644 --- a/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF +; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64-linux-gnu -O2 | FileCheck %s --check-prefix=CHECK-ELF ; CHECK-ELF-NOT: .loh ; CHECK-ELF-NOT: AdrpAdrp @@ -633,11 +633,14 @@ define void @setL(<1 x i8> %t) { ; a tuple register to appear in the lowering. Thus, the target ; cpu is required to have the problem reproduced.
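; (Background for the labels checked below: the assembler attaches an Lloh<N>
; label to each instruction referenced by a .loh directive, so the test
; captures the two labels and verifies that the final .loh AdrpLdr names
; exactly that pair.)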
; CHECK-LABEL: _uninterestingSub +; CHECK: [[LOH_LABEL0:Lloh[0-9]+]]: ; CHECK: adrp [[ADRP_REG:x[0-9]+]], [[CONSTPOOL:lCPI[0-9]+_[0-9]+]]@PAGE -; CHECK-NEXT: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] +; CHECK: [[LOH_LABEL1:Lloh[0-9]+]]: +; CHECK: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] ; The tuple comes from the next instruction. ; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]] ; CHECK: ret +; CHECK: .loh AdrpLdr [[LOH_LABEL0]], [[LOH_LABEL1]] define void @uninterestingSub(i8* nocapture %row) #0 { %tmp = bitcast i8* %row to <16 x i8>* %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16 @@ -664,10 +667,10 @@ entry: if.then.i: ret void if.end.i: -; CHECK: .loh AdrpAdrp Lloh91, Lloh93 -; CHECK: .loh AdrpLdr Lloh91, Lloh92 -; CHECK: .loh AdrpLdrGot Lloh93, Lloh95 -; CHECK: .loh AdrpLdrGot Lloh94, Lloh96 +; CHECK: .loh AdrpLdrGot +; CHECK: .loh AdrpLdrGot +; CHECK: .loh AdrpAdrp +; CHECK: .loh AdrpLdr %mul.i.i.i = fmul double undef, 1.000000e-06 %add.i.i.i = fadd double undef, %mul.i.i.i %sub.i.i = fsub double %add.i.i.i, undef diff --git a/test/CodeGen/AArch64/loh.mir b/test/CodeGen/AArch64/loh.mir new file mode 100644 index 000000000000..1d08ebdc5790 --- /dev/null +++ b/test/CodeGen/AArch64/loh.mir @@ -0,0 +1,193 @@ +# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @func0() { ret void } + + declare void @extfunc() + + @g0 = external global i32 + @g1 = external global i32 + @g2 = external global i32 + @g3 = external global i32 + @g4 = external global i32 + @g5 = external global i32 +... +--- +# Check various LOH variants. Remember that the algorithm walks the basic +# blocks backwards.
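+# For orientation (comment only, nothing in this paragraph is checked): an
+# MCLOH_AdrpAdrp annotation marks a register-reuse pair such as
+#   adrp x1, _g2@PAGE
+#   adrp x1, _g3@PAGE
+# telling the linker it may simplify the second adrp when both globals end up
+# on the same page.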
+# CHECK-LABEL: ********** AArch64 Collect LOH ********** +# CHECK-LABEL: Looking in function func0 +name: func0 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g3> + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g4> + ; CHECK-NEXT: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g2> + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g3> + ; CHECK-NEXT: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X0<def> = ADRP <ga:@g0> + ; CHECK-NEXT: %X0<def> = ADRP <ga:@g1> + %x0 = ADRP target-flags(aarch64-page) @g0 + %x0 = ADRP target-flags(aarch64-page) @g1 + %x1 = ADRP target-flags(aarch64-page) @g2 + %x1 = ADRP target-flags(aarch64-page) @g3 + %x1 = ADRP target-flags(aarch64-page) @g4 + + bb.1: + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X20<def> = ADRP <ga:@g0> + ; CHECK-NEXT: %X3<def> = ADDXri %X20, <ga:@g0> + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g0> + ; CHECK-NEXT: %X1<def> = ADDXri %X1, <ga:@g0> + %x1 = ADRP target-flags(aarch64-page) @g0 + %x9 = SUBXri undef %x11, 5, 0 ; should not affect MCLOH formation + %x1 = ADDXri %x1, target-flags(aarch64-pageoff) @g0, 0 + %x20 = ADRP target-flags(aarch64-page) @g0 + BL @extfunc, csr_aarch64_aapcs ; should not clobber X20 + %x3 = ADDXri %x20, target-flags(aarch64-pageoff) @g0, 0 + + bb.2: + ; CHECK-NOT: MCLOH_AdrpAdd + %x9 = ADRP target-flags(aarch64-page) @g0 + BL @extfunc, csr_aarch64_aapcs ; clobbers x9 + ; Verification requires the use of 'undef' in front of the clobbered %x9 + %x9 = ADDXri undef %x9, target-flags(aarch64-pageoff) @g0, 0 + + bb.3: + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + HINT 0, implicit def %x10 ; clobbers x10 + %x10 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + + bb.4: + ; Cannot produce a LOH for multiple users + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + HINT 0, implicit def %x10 ; clobbers x10 + %x11 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + %x12 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + + bb.5: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X5<def> = ADRP <ga:@g2> + ; CHECK-NEXT: %S6<def> = LDRSui %X5, <ga:@g2> + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X4<def> = ADRP <ga:@g2> + ; CHECK-NEXT: %X4<def> = LDRXui %X4, <ga:@g2> + %x4 = ADRP target-flags(aarch64-page) @g2 + %x4 = LDRXui %x4, target-flags(aarch64-pageoff) @g2 + %x5 = ADRP target-flags(aarch64-page) @g2 + %s6 = LDRSui %x5, target-flags(aarch64-pageoff) @g2 + + bb.6: + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGot: + ; CHECK-NEXT: %X5<def> = ADRP <ga:@g2> + ; CHECK-NEXT: %X6<def> = LDRXui %X5, <ga:@g2> + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGot: + ; CHECK-NEXT: %X4<def> = ADRP <ga:@g2> + ; CHECK-NEXT: %X4<def> = LDRXui %X4, <ga:@g2> + %x4 = ADRP target-flags(aarch64-page, aarch64-got) @g2 + %x4 = LDRXui %x4, target-flags(aarch64-pageoff, aarch64-got) @g2 + %x5 = ADRP target-flags(aarch64-page, aarch64-got) @g2 + %x6 = LDRXui %x5, target-flags(aarch64-pageoff, aarch64-got) @g2 + + bb.7: + ; CHECK-NOT: Adding MCLOH_AdrpLdrGot: + ; Loading a float value from a GOT table makes no sense so this should not + ; produce an LOH. 
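+ ; (A GOT slot always holds a pointer, so an AdrpLdrGot pairing expects the
+ ; load to target a GPR; the floating-point destination %s11 below is what
+ ; rules the LOH out.)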
+ %x11 = ADRP target-flags(aarch64-page, aarch64-got) @g5 + %s11 = LDRSui %x11, target-flags(aarch64-pageoff, aarch64-got) @g5 + + bb.8: + ; CHECK-NEXT: Adding MCLOH_AdrpAddLdr: + ; CHECK-NEXT: %X7<def> = ADRP <ga:@g3>[TF=1] + ; CHECK-NEXT: %X8<def> = ADDXri %X7, <ga:@g3> + ; CHECK-NEXT: %D1<def> = LDRDui %X8, 8 + %x7 = ADRP target-flags(aarch64-page) @g3 + %x8 = ADDXri %x7, target-flags(aarch64-pageoff) @g3, 0 + %d1 = LDRDui %x8, 8 + + bb.9: + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X3<def> = ADRP <ga:@g3> + ; CHECK-NEXT: %X3<def> = ADDXri %X3, <ga:@g3> + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X5<def> = ADRP <ga:@g3> + ; CHECK-NEXT: %X2<def> = ADDXri %X5, <ga:@g3> + ; CHECK-NEXT: Adding MCLOH_AdrpAddStr: + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g3> + ; CHECK-NEXT: %X1<def> = ADDXri %X1, <ga:@g3> + ; CHECK-NEXT: STRXui %XZR, %X1, 16 + %x1 = ADRP target-flags(aarch64-page) @g3 + %x1 = ADDXri %x1, target-flags(aarch64-pageoff) @g3, 0 + STRXui %xzr, %x1, 16 + + ; This sequence should just produce an AdrpAdd (not AdrpAddStr) + %x5 = ADRP target-flags(aarch64-page) @g3 + %x2 = ADDXri %x5, target-flags(aarch64-pageoff) @g3, 0 + STRXui %x2, undef %x11, 16 + + ; This sequence should just produce an AdrpAdd (not AdrpAddStr) + %x3 = ADRP target-flags(aarch64-page) @g3 + %x3 = ADDXri %x3, target-flags(aarch64-pageoff) @g3, 0 + STRXui %x3, %x3, 16 + + bb.10: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X2<def> = ADRP <ga:@g3> + ; CHECK-NEXT: %X2<def> = LDRXui %X2, <ga:@g3> + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGotLdr: + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g4> + ; CHECK-NEXT: %X1<def> = LDRXui %X1, <ga:@g4> + ; CHECK-NEXT: %X1<def> = LDRXui %X1, 24 + %x1 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x1 = LDRXui %x1, target-flags(aarch64-pageoff, aarch64-got) @g4 + %x1 = LDRXui %x1, 24 + ; Should just produce a MCLOH_AdrpLdr (not MCLOH_AdrpLdrGotLdr) + %x2 = ADRP target-flags(aarch64-page) @g3 + %x2 = LDRXui %x2, target-flags(aarch64-pageoff) @g3 + %x2 = LDRXui %x2, 24 + + bb.11: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr + ; CHECK-NEXT: %X5<def> = ADRP <ga:@g1> + ; CHECK-NEXT: %X5<def> = LDRXui %X5, <ga:@g1> + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGotStr: + ; CHECK-NEXT: %X1<def> = ADRP <ga:@g4> + ; CHECK-NEXT: %X1<def> = LDRXui %X1, <ga:@g4> + ; CHECK-NEXT: STRXui %XZR, %X1, 32 + %x1 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x1 = LDRXui %x1, target-flags(aarch64-pageoff, aarch64-got) @g4 + STRXui %xzr, %x1, 32 + ; Should just produce a MCLOH_AdrpLdr (not MCLOH_AdrpLdrGotStr) + %x5 = ADRP target-flags(aarch64-page) @g1 + %x5 = LDRXui %x5, target-flags(aarch64-pageoff) @g1 + STRXui undef %x11, %x5, 32 + + bb.12: + ; CHECK-NOT: MCLOH_AdrpAdrp + ; CHECK: Adding MCLOH_AdrpAddLdr + ; %X9<def> = ADRP <ga:@g4> + ; %X9<def> = ADDXri %X9, <ga:@g4> + ; %X5<def> = LDRXui %X9, 0 + %x9 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x9 = ADDXri %x9, target-flags(aarch64-pageoff, aarch64-got) @g4, 0 + %x5 = LDRXui %x9, 0 + %x9 = ADRP target-flags(aarch64-page, aarch64-got) @g5 + + bb.13: + successors: %bb.14 + ; Cannot produce a LOH for multiple users + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + %x11 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + B %bb.14 + + bb.14: + liveins: %x10 + %x12 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 +... 
diff --git a/test/CodeGen/AArch64/machine-scheduler.mir b/test/CodeGen/AArch64/machine-scheduler.mir index e7e0dda53c57..933afdb6da9b 100644 --- a/test/CodeGen/AArch64/machine-scheduler.mir +++ b/test/CodeGen/AArch64/machine-scheduler.mir @@ -21,8 +21,9 @@ # CHECK: LDRWui %x0, 0 # CHECK: LDRWui %x0, 1 # CHECK: STRWui %w1, %x0, 2 -name: load_imp-def -body: | +name: load_imp-def +tracksRegLiveness: true +body: | bb.0.entry: liveins: %w1, %x0 %w8 = LDRWui %x0, 1, implicit-def %x8 :: (load 4 from %ir.0) diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index 28c8b5d73b02..d9662b69b126 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -30,12 +30,11 @@ ; ELF: Type: Function (0x2) ; ELF: } +; HSA: .text ; HSA: .hsa_code_object_version 2,1 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .text - ; HSA-NOT: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 78a5cdb576f5..12c15441c0f5 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -34,12 +34,12 @@ ; ELF: Type: AMDGPU_HSA_KERNEL (0xA) ; ELF: } +; HSA-NOT: .AMDGPU.config +; HSA: .text ; HSA: .hsa_code_object_version 2,1 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .text - ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t diff --git a/test/CodeGen/Generic/cfi-sections.ll b/test/CodeGen/Generic/cfi-sections.ll new file mode 100644 index 000000000000..6e721d6df706 --- /dev/null +++ b/test/CodeGen/Generic/cfi-sections.ll @@ -0,0 +1,39 @@ +; When using Itanium ABI, do not emit .debug_frame. +; RUN: llc -mtriple=i386--linux -o - < %s | FileCheck %s -check-prefix=WITHOUT +; RUN: llc -mtriple=armv7-netbsd-eabi -o - < %s | FileCheck %s -check-prefix=WITHOUT + +; When using EHABI, do emit .debug_frame.
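+; (Rationale, informational only: EHABI routes unwind data to .ARM.exidx and
+; .ARM.extab for exception handling, so frame information for debuggers is
+; emitted separately through the .cfi_sections .debug_frame directive checked
+; below.)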
+; RUN: llc -mtriple=arm-linux -mcpu=cortex-a7 -mattr=v7 -o - < %s | FileCheck %s -check-prefix=WITH + +; REQUIRES: x86-registered-target +; REQUIRES: arm-registered-target + +; WITH: .cfi_sections .debug_frame +; WITHOUT-NOT: .cfi_sections + +define i32 @foo() #0 !dbg !7 { + %1 = call i32 @bar() + %2 = call i32 @bar() + %3 = add nsw i32 %1, %2 + ret i32 %3 +} + +declare i32 @bar() #1 + +attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "cfi-sections.cc", directory: ".") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{i32 1, !"min_enum_size", i32 4} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{!10} +!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) diff --git a/test/CodeGen/MIR/AArch64/spill-fold.mir b/test/CodeGen/MIR/AArch64/spill-fold.mir new file mode 100644 index 000000000000..05e7f7521ed5 --- /dev/null +++ b/test/CodeGen/MIR/AArch64/spill-fold.mir @@ -0,0 +1,82 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass greedy -verify-machineinstrs -o - %s | FileCheck %s +--- | + define i64 @test_subreg_spill_fold() { ret i64 0 } + define i64 @test_subreg_spill_fold2() { ret i64 0 } + define i64 @test_subreg_spill_fold3() { ret i64 0 } + define i64 @test_subreg_fill_fold() { ret i64 0 } + define double @test_subreg_fill_fold2() { ret double 0.0 } +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold +# Ensure that the spilled subreg COPY is eliminated and folded into the spill store. 
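+# Concretely (illustrative, not one of the checked patterns): instead of
+# materializing the zero with
+#   undef %0.sub_32 = COPY %wzr
+# and spilling %0 afterwards, the fold stores the zero register directly:
+#   STRXui %xzr, %stack.0, 0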
+name: test_subreg_spill_fold +registers: + - { id: 0, class: gpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold2 +# Similar to test_subreg_spill_fold, but with a vreg0 register class not containing %WZR. +name: test_subreg_spill_fold2 +registers: + - { id: 0, class: gpr64sp } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = ADDXri %0, 1, 0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold3 +# Similar to test_subreg_spill_fold, but with a cross register class copy. 
+name: test_subreg_spill_fold3 +registers: + - { id: 0, class: fpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.ssub = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %d0, 12, implicit-def dead %d1, 12, implicit-def dead %d2, 12, implicit-def dead %d3, 12, implicit-def dead %d4, 12, implicit-def dead %d5, 12, implicit-def dead %d6, 12, implicit-def dead %d7, 12, implicit-def dead %d8, 12, implicit-def dead %d9, 12, implicit-def dead %d10, 12, implicit-def dead %d11, 12, implicit-def dead %d12, 12, implicit-def dead %d13, 12, implicit-def dead %d14, 12, implicit-def dead %d15, 12, implicit-def dead %d16, 12, implicit-def dead %d17, 12, implicit-def dead %d18, 12, implicit-def dead %d19, 12, implicit-def dead %d20, 12, implicit-def dead %d21, 12, implicit-def dead %d22, 12, implicit-def dead %d23, 12, implicit-def dead %d24, 12, implicit-def dead %d25, 12, implicit-def dead %d26, 12, implicit-def dead %d27, 12, implicit-def dead %d28, 12, implicit-def dead %d29, 12, implicit-def dead %d30, 12, implicit-def %d31 + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold +# Ensure that the filled COPY is eliminated and folded into the fill load. +name: test_subreg_fill_fold +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.sub_32 = LDRWui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.sub_32 = COPY %0 + %x0 = COPY %1 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold2 +# Similar to test_subreg_fill_fold, but with a cross-class copy. 
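+# The expected fold reloads straight into the FPR subregister (the LDRSui in
+# the CHECK line below), avoiding a separate GPR-to-FPR COPY.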
+name: test_subreg_fill_fold2 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: fpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.ssub = LDRSui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.ssub = COPY %0 + %d0 = COPY %1 + RET_ReallyLR implicit %d0 +... diff --git a/test/CodeGen/MIR/X86/basic-block-liveins.mir b/test/CodeGen/MIR/X86/basic-block-liveins.mir index 35f5512936ba..b347368a94b1 100644 --- a/test/CodeGen/MIR/X86/basic-block-liveins.mir +++ b/test/CodeGen/MIR/X86/basic-block-liveins.mir @@ -22,7 +22,8 @@ ... --- -name: test +name: test +tracksRegLiveness: true body: | ; CHECK-LABEL: bb.0.body: ; CHECK-NEXT: liveins: %edi, %esi @@ -33,7 +34,8 @@ body: | RETQ %eax ... --- -name: test2 +name: test2 +tracksRegLiveness: true body: | ; CHECK-LABEL: name: test2 ; Verify that we can have multiple lists of liveins that will be merged into @@ -48,7 +50,8 @@ body: | RETQ %eax ... --- -name: test3 +name: test3 +tracksRegLiveness: true body: | ; Verify that we can have an empty list of liveins. ; CHECK-LABEL: name: test3 diff --git a/test/CodeGen/MIR/X86/machine-verifier.mir b/test/CodeGen/MIR/X86/machine-verifier.mir index c56bab8c998c..7421146c22ed 100644 --- a/test/CodeGen/MIR/X86/machine-verifier.mir +++ b/test/CodeGen/MIR/X86/machine-verifier.mir @@ -10,7 +10,8 @@ ... 
--- -name: inc +name: inc +tracksRegLiveness: true body: | bb.0.entry: liveins: %edi diff --git a/test/CodeGen/NVPTX/tid-range.ll b/test/CodeGen/NVPTX/tid-range.ll new file mode 100644 index 000000000000..3dc4008810a1 --- /dev/null +++ b/test/CodeGen/NVPTX/tid-range.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=nvptx64 | FileCheck %s +declare i32 @get_register() + +define i1 @test1() { +entry: + %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !0 + %cmp = icmp eq i32 %call, 1 + ret i1 %cmp +} + +; CHECK-LABEL: test1( +; CHECK: setp.eq.s32 %p1, %r1, 1; +; CHECK: selp.u32 %[[R:.+]], 1, 0, %p1; +; CHECK: st.param.b32 [func_retval0+0], %[[R]]; + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!0 = !{ i32 0, i32 3 } diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll index 6cda38aa94fe..425d2609380e 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll @@ -24,7 +24,7 @@ define void @test_void_return() { ; CHECK-NEXT: hasVAStart: false ; CHECK-NEXT: hasMustTailInVarArgFunc: false ; CHECK-NEXT: body: -; CHECK-NEXT: bb.1: +; CHECK-NEXT: bb.1.entry: ; CHECK-NEXT: RET 0 entry: ret void diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 3c649e18bc38..8590d641a4c5 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2902,6 +2902,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, < define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { ; CHECK-LABEL: test_mask_vextracti64x4: ; CHECK: ## BB#0: +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: kshiftlw $12, %k1, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 @@ -2923,7 +2924,7 @@ define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) ; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask) ret <4 x i64> %res } @@ -2963,9 +2964,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x define <4 x double> @test_vextractf64x4(<8 x double> %a) { ; CHECK-LABEL: test_vextractf64x4: ; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) ret <4 x double> %res } diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll index 646697b82c2d..04d21ecd3e82 100644 --- a/test/CodeGen/X86/avx512-trunc.ll +++ b/test/CodeGen/X86/avx512-trunc.ll @@ -500,3 +500,110 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { store <8 x i8> %x, <8 x i8>* %res ret void } + + +define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_256_mem: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vmovdqu %xmm0, (%rdi) +; 
KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %ymm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { +; KNL-LABEL: usat_trunc_wb_256: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + ret <16 x i8> %x6 +} + +define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_128_mem: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + %x6 = trunc <8 x i16> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) { +; ALL-LABEL: usat_trunc_db_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusdb %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %x6 = trunc <16 x i32> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) { +; ALL-LABEL: usat_trunc_qb_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> + %x6 = trunc <8 x i64> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) { +; ALL-LABEL: usat_trunc_qd_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqd %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 
4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> + %x6 = trunc <8 x i64> %x5 to <8 x i32> + store <8 x i32> %x6, <8 x i32>* %res, align 1 + ret void +} + +define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { +; ALL-LABEL: usat_trunc_qw_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqw %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> + %x6 = trunc <8 x i64> %x5 to <8 x i16> + store <8 x i16> %x6, <8 x i16>* %res, align 1 + ret void +} + diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll index a961dbac7dd8..8e9bc8b5af4b 100644 --- a/test/CodeGen/X86/cmov.ll +++ b/test/CodeGen/X86/cmov.ll @@ -156,3 +156,21 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind { %d = select i1 %c, i8 %a, i8 %b ret i8 %d } + +; FIXME: The 'not' is redundant. + +define i32 @smin(i32 %x) { +; CHECK-LABEL: smin: +; CHECK: ## BB#0: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: notl %ecx +; CHECK-NEXT: xorl $-1, %edi +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: cmovsl %ecx, %eax +; CHECK-NEXT: retq + %not_x = xor i32 %x, -1 + %1 = icmp slt i32 %not_x, -1 + %sel = select i1 %1, i32 %not_x, i32 -1 + ret i32 %sel +} + diff --git a/test/CodeGen/X86/lower-vec-shift-2.ll b/test/CodeGen/X86/lower-vec-shift-2.ll index 6ca76c2e7e40..a617f44d3f98 100644 --- a/test/CodeGen/X86/lower-vec-shift-2.ll +++ b/test/CodeGen/X86/lower-vec-shift-2.ll @@ -12,8 +12,7 @@ define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test1: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -32,8 +31,7 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test2: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -68,8 +66,7 @@ define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test4: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -88,8 +85,7 @@ define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test5: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -124,8 +120,7 @@ define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test7: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} 
xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -144,8 +139,7 @@ define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test8: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll new file mode 100644 index 000000000000..f9fe97b21ee3 --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -0,0 +1,481 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v8i16_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i16_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i16_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i16_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <8 x i16> + %strided.vec = trunc <8 x i16> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v8i16_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i16_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i16_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v4i32_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc 
= bitcast <8 x i16> %vec to <4 x i32> + %strided.vec = trunc <4 x i32> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; AVX-LABEL: shuffle_v4i32_to_v2i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v4i32_to_v2i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_to_v2i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v4i32_to_v2i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; AVX-LABEL: trunc_v2i64_to_v2i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %bc = bitcast <4 x i32> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i32> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load 
<16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v4i32_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <4 x i32> + %strided.vec = trunc <4 x i32> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v8i16_to_v2i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i16_to_v2i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v2i64_to_v2i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: 
trunc_v2i64_to_v2i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc = bitcast <8 x i16> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i16> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v2i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v2i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v2i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v2i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} + +define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v2i64_to_v2i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i8> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll new file mode 100644 index 000000000000..893f96e6fb22 --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -0,0 +1,629 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX-LABEL: 
trunc_v16i16_to_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v16i16_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v16i16_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v16i16_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <16 x i16> + %strided.vec = trunc <16 x i16> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v16i16_to_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v8i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BWVL: # BB#0: +; 
AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v8i32_to_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX-LABEL: shuffle_v8i32_to_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i32_to_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512F-NEXT: vmovaps %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; 
AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %bc = bitcast <8 x i32> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i32> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: 
shuffle_v32i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v8i32_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v16i16_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: 
vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll new file mode 100644 index 000000000000..923290411ae3 --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -0,0 +1,537 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s 
--check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v32i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void 
@trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: trunc_v32i16_to_v32i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v32i16_to_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v32i16_to_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <32 x i16> + %strided.vec = trunc <32 x i16> %bc to <32 x i8> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v16i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqu 
{{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] +; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i16> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512-LABEL: shuffle_v16i32_to_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %bc = bitcast <16 x i32> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i32> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = 
<0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx +; AVX512BWVL-NEXT: vmovd %ecx, %xmm1 +; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; 
AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v8i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; 
AVX512BW-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vmovd %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vmovd %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, 
%xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %r8d +; AVX512BW-NEXT: vpextrb $0, %xmm1, %r9d +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %r10d +; AVX512BW-NEXT: vpextrb $0, %xmm1, %r11d +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax +; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx +; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi +; AVX512BW-NEXT: vpinsrb $0, %edi, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $4, %r11d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/tail-call-conditional.mir b/test/CodeGen/X86/tail-call-conditional.mir index af6e95d46107..75cb1e451d83 
100644 --- a/test/CodeGen/X86/tail-call-conditional.mir +++ b/test/CodeGen/X86/tail-call-conditional.mir @@ -26,7 +26,8 @@ ... --- -name: test +name: test +tracksRegLiveness: true liveins: - { reg: '%rdi' } - { reg: '%rsi' } diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index 50febd4c1ec7..fbb67ebbf60c 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -87,14 +87,12 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm3, %xmm4 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psllq %xmm1, %xmm3 ; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero ; X32-SSE-NEXT: psrlq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; X32-SSE-NEXT: orpd %xmm4, %xmm1 diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index fc67914015b5..27b65b829923 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
@@ -80,7 +80,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX512-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -90,20 +90,19 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X32-SSE-LABEL: var_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: psrlq %xmm2, %xmm4 -; X32-SSE-NEXT: movq {{.*#+}} xmm5 = xmm1[0],zero -; X32-SSE-NEXT: psrlq %xmm5, %xmm3 -; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq %xmm2, %xmm1 -; X32-SSE-NEXT: psrlq %xmm5, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X32-SSE-NEXT: xorpd %xmm4, %xmm1 -; X32-SSE-NEXT: psubq %xmm4, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: movdqa %xmm2, %xmm4 +; X32-SSE-NEXT: psrlq %xmm3, %xmm4 +; X32-SSE-NEXT: psrlq %xmm1, %xmm2 +; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrlq %xmm3, %xmm2 +; X32-SSE-NEXT: psrlq %xmm1, %xmm0 +; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; X32-SSE-NEXT: xorpd %xmm4, %xmm2 +; X32-SSE-NEXT: psubq %xmm4, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i64> %a, %b ret <2 x i64> %shift @@ -189,7 +188,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -323,11 +322,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -499,7 +498,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -627,7 +626,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX512-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 @@ -637,7 +636,6 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x 
i64> %a, <2 x i64> %b) nounwind { ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X32-SSE-NEXT: psrlq %xmm1, %xmm2 ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 @@ -659,29 +657,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: psrad %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrad %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -706,29 +700,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psraw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -919,7 +909,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] @@ -1066,7 +1056,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; 
AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -1150,7 +1140,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1232,11 +1222,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -1393,7 +1383,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] @@ -1528,7 +1518,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $7, %xmm0, %xmm1 ; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -1564,7 +1554,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1593,7 +1583,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1632,7 +1622,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 5725fcb8c121..ee1879b6696e 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; @@ -66,7 +66,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX512-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -131,7 +131,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, %b @@ -213,11 +213,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -332,7 +332,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -411,7 +411,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX512-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 @@ -426,9 +426,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -436,16 +435,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -453,15 +450,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -473,8 +468,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -482,16 +476,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -499,15 +491,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -619,7 +609,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] @@ -702,7 +692,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -750,7 +740,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> @@ -815,11 +805,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift @@ -924,7 +914,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] @@ -998,7 +988,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $7, %ymm0, %ymm1 ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -1035,7 +1025,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -1070,7 +1060,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -1123,7 +1113,7 @@ define <32 x i8> 
@splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll index 27ff134fd109..1280641c557b 100644 --- a/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, %b @@ -16,7 +16,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, %b @@ -25,7 +25,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -48,7 +48,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, %b @@ -57,7 +57,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -109,100 +109,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; 
AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -212,11 +212,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -224,85 +224,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> 
%CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -310,17 +310,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> 
%ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -328,89 +328,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: 
vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -418,86 +418,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, 
%xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -514,7 +514,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -524,9 +524,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} 
xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -536,17 +535,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsraw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsraw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -556,7 +553,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] @@ -602,101 +599,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; 
AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -706,11 +703,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; 
AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -718,85 +715,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: 
vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -804,17 +801,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -822,89 +819,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; 
AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %sil ; 
AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -912,86 +909,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; 
AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -1009,7 +1006,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62> @@ -1018,7 +1015,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> @@ -1027,7 +1024,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -1049,7 +1046,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> @@ -1058,7 +1055,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] @@ -1104,7 +1101,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1362,7 +1359,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsraq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7> @@ -1371,7 +1368,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrad $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -1380,13 +1377,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsraw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -1395,7 +1392,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1409,7 +1406,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -1422,14 +1419,14 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) { ; AVX512DQ-LABEL: ashr_const7_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: 
retq ; ; AVX512BW-LABEL: ashr_const7_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 0dab815d4d49..42488f2ec3a7 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -60,7 +60,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -69,7 +69,6 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrlq %xmm3, %xmm2 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; X32-SSE-NEXT: movapd %xmm2, %xmm0 @@ -158,7 +157,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -292,11 +291,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -417,7 +416,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 @@ -487,13 +486,12 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; 
X32-SSE: # BB#0: -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -511,29 +509,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: psrld %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -558,29 +552,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psrlw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -709,7 +699,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 @@ -810,7 +800,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -884,7 +874,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: 
constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -966,11 +956,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -1073,7 +1063,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 @@ -1145,7 +1135,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1174,7 +1164,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1203,7 +1193,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1236,7 +1226,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index 09822ee6c614..5223d7bba353 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; @@ -47,7 +47,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, %b @@ -108,7 +108,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; 
AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, %b @@ -190,11 +190,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -276,7 +276,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -326,7 +326,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -337,9 +337,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -347,16 +346,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -364,15 +361,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector 
<8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -384,8 +379,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -393,16 +387,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -410,15 +402,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -501,7 +491,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -560,7 +550,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> @@ -605,7 +595,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> @@ -670,11 +660,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> 
%ZMM0<kill> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift @@ -750,7 +740,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 @@ -801,7 +791,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> @@ -836,7 +826,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -871,7 +861,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -913,7 +903,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll index 06bf12a621a1..4c3caf329fb7 100644 --- a/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -8,7 +8,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, %b @@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, %b @@ -26,7 +26,7 @@ define <16 x i32> 
@var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -49,7 +49,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, %b @@ -58,7 +58,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -89,100 +89,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; 
AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -192,11 +192,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: 
shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -204,85 +204,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; 
AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -290,17 +290,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -308,89 +308,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; 
AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -398,86 +398,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: 
movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> 
%ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -494,7 +494,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -504,9 +504,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -516,17 +515,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -536,7 +533,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -565,101 +562,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; 
AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: 
shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -669,11 +666,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -681,85 +678,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; 
AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -767,17 +764,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; 
AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -785,89 +782,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: 
vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -875,86 +872,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> 
%CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -972,7 +969,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62> @@ -981,7 +978,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> @@ -990,7 +987,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -1012,7 +1009,7 @@ define <32 x i16> 
@constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> @@ -1021,7 +1018,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1050,7 +1047,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1308,7 +1305,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7> @@ -1317,7 +1314,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrld $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -1326,13 +1323,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -1341,7 +1338,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1350,7 +1347,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, 
%zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index ec2e61d3ca02..5c89949e924b 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -58,7 +58,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -67,7 +67,6 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psllq %xmm3, %xmm2 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psllq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; X32-SSE-NEXT: movapd %xmm2, %xmm0 @@ -124,7 +123,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -247,11 +246,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -367,7 +366,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 @@ -435,13 +434,12 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psllq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = 
shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -459,29 +457,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: pslld %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -506,29 +500,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psllw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -650,7 +640,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 @@ -747,7 +737,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -802,7 +792,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ 
-838,11 +828,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -928,7 +918,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 @@ -998,7 +988,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1027,7 +1017,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1056,7 +1046,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1087,7 +1077,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index 00d040633014..eb52ae3ccaca 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -45,7 +45,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, %b @@ -89,7 +89,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, %b @@ -165,11 +165,11 @@ define <16 x i16> 
@var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -241,7 +241,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -290,7 +290,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -301,9 +301,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -311,16 +310,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -328,15 +325,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -348,8 +343,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) 
nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -357,16 +351,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -374,15 +366,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -457,7 +447,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -512,7 +502,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62> @@ -547,7 +537,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> @@ -582,11 +572,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 
4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> ret <16 x i16> %shift @@ -656,7 +646,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 @@ -706,7 +696,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7> @@ -741,7 +731,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -776,7 +766,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -817,7 +807,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll index eb1309d9bb01..520c3237a57f 100644 --- a/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/test/CodeGen/X86/vector-shift-shl-512.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -8,7 +8,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, %b @@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, %b @@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; 
AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -49,7 +49,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, %b @@ -58,7 +58,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -86,100 +86,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> 
%ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -189,11 +189,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -201,85 +201,85 @@ define <64 x i8> @var_shift_v64i8(<64 
x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> 
%CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -287,17 +287,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -305,89 +305,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, 
%xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -395,86 +395,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; 
AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; 
AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -491,7 +491,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -501,9 +501,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -513,17 +512,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -533,7 +530,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] @@ -559,101 +556,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: 
%CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb 
$12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -663,11 +660,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -675,85 +672,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; 
AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -761,17 +758,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -779,89 +776,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) 
nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; 
AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -869,86 +866,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl 
%al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL<def> %CL<kill> %ECX<kill> +; AVX512BW-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -966,7 +963,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62> @@ -975,7 +972,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7> @@ -984,14 +981,14 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, 
i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> @@ -1000,7 +997,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1026,7 +1023,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1284,7 +1281,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7> @@ -1293,7 +1290,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpslld $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -1302,13 +1299,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> @@ -1317,7 +1314,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1326,7 +1323,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index cad8f85395d5..2aab77433dfb 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ 
b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -702,17 +702,11 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE41-NEXT: pinsrb $5, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <16 x i8> %shuffle @@ -739,17 +733,11 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16( ; SSE41-NEXT: pinsrb $15, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16> ret <16 x i8> %shuffle @@ -776,17 +764,11 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE41-NEXT: pinsrb $2, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ret <16 x i8> %shuffle @@ -1222,19 +1204,12 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: -; AVX1OR2: # BB#0: # %entry -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: -; AVX512VL: # BB#0: # %entry -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq entry: %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0> @@ -1771,21 +1746,13 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: PR31364: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: PR31364: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; AVX512VL-NEXT: retq +; AVX-LABEL: PR31364: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] +; AVX-NEXT: retq %v0 = load i8, i8* %a, align 1 %vecins = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %v0, i32 0 %v1 = load i8, i8* %b, align 1 diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 4270d3d216c5..3e9e980a1973 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1354,19 +1354,12 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_z6zz: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_z6zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: 
shuffle_v4i32_z6zz: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> ret <4 x i32> %shuffle } @@ -1683,17 +1676,11 @@ define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_0z23: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_0z23: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_0z23: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3> ret <4 x i32> %shuffle } @@ -1726,17 +1713,11 @@ define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_01z3: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_01z3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_01z3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3> ret <4 x i32> %shuffle } @@ -1769,17 +1750,11 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_012z: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_012z: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_012z: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ret <4 x i32> %shuffle } @@ -1812,17 +1787,11 @@ define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_0zz3: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_0zz3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX512VL-NEXT: retq 
+; AVX2OR512VL-LABEL: shuffle_v4i32_0zz3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3> ret <4 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 2421b2b579cf..ac9db62d3c13 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1415,17 +1415,11 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { ; SSE-NEXT: pinsrw $1, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_z8zzzzzz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3> ret <8 x i16> %shuffle @@ -1438,17 +1432,11 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { ; SSE-NEXT: pinsrw $5, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzzzz8zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0> ret <8 x i16> %shuffle @@ -1461,17 +1449,11 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-NEXT: pinsrw $7, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zuuzuuz8: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8> ret <8 x i16> %shuffle @@ -1484,17 +1466,11 @@ define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { ; SSE-NEXT: pinsrw $2, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; 
AVX512VL-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzBzzzzz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 3 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i16> %shuffle @@ -2102,17 +2078,11 @@ define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0z234567: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0z234567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0z234567: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i16> %shuffle } @@ -2134,17 +2104,11 @@ define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0zzzz5z7: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0zzzz5z7: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0zzzz5z7: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 5, i32 8, i32 7> ret <8 x i16> %shuffle } @@ -2166,17 +2130,11 @@ define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0123456z: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0123456z: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0123456z: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15> ret <8 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll index 2837c28a4841..04d6b3733246 100644 --- a/test/CodeGen/X86/vector-shuffle-masked.ll +++ b/test/CodeGen/X86/vector-shuffle-masked.ll @@ -236,3 +236,453 @@ define 
<8 x i32> @maskz_shuffle_v8i32_23456701(<8 x i32> %a, i8 %mask) { %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> zeroinitializer ret <8 x i32> %res } + +define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %passthru, i8 
%mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v8i32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v8i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v8f32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + 
%res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> %passthru + ret <8 x float> %res +} + +define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v8f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> %passthru + ret <8 x float> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 4, i32 5> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 
x i32> <i32 0, i32 1> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 4, i32 5> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v4i64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v4i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x double> 
@mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v4f64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> %passthru + ret <4 x double> %res +} + +define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v4f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> %passthru + ret <4 x double> %res +} + +define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v8i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.cast = bitcast <4 x i64> %shuffle to <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle.cast, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v8f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.cast = bitcast <4 x double> %shuffle to <8 x float> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle.cast, <8 x float> %passthru + ret <8 x float> %res +} + +define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3> + %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle.cast, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; 
CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3> + %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle.cast, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.cast = bitcast <8 x i32> %shuffle to <4 x i64> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle.cast, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %shuffle.cast = bitcast <8 x float> %shuffle to <4 x double> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle.cast, <4 x double> %passthru + ret <4 x double> %res +} + +define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle.cast, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> %passthru + ret <2 x double> %res +} diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 
2f5e177badce..bf32e672138c 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -100,73 +100,22 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv2i64: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv2i64: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv2i64: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv2i64: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv2i64: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm3, 
%xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # BB#0: @@ -873,81 +822,24 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv8i16: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv8i16: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv8i16: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv8i16: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CD-NEXT: 
vpsrlw $8, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: @@ -1071,81 +963,24 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv8i16u: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv8i16u: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv8i16u: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv8i16u: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), 
%xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv8i16u: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: @@ -1253,69 +1088,21 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE41-NEXT: paddb %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv16i8: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv16i8: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv16i8: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv16i8: -; 
AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: @@ -1419,69 +1206,21 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE41-NEXT: paddb %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv16i8u: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv16i8u: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv16i8u: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv16i8u: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv16i8u: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll index 4e5fb60fae8b..5d486e794051 100644 --- a/test/CodeGen/X86/vshift-4.ll +++ b/test/CodeGen/X86/vshift-4.ll @@ -9,7 +9,6 @@ define void @shift1a(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-LABEL: shift1a: ; X32: # BB#0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-NEXT: psllq %xmm1, %xmm0 ; X32-NEXT: movdqa %xmm0, (%eax) ; X32-NEXT: retl @@ -34,7 +33,6 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; X32-NEXT: movdqa %xmm0, %xmm3 ; X32-NEXT: psllq %xmm2, %xmm3 -; X32-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-NEXT: psllq %xmm1, %xmm0 ; X32-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; X32-NEXT: movapd %xmm3, (%eax) diff --git a/test/DebugInfo/Generic/licm-hoist-debug-loc.ll b/test/DebugInfo/Generic/licm-hoist-debug-loc.ll new file mode 100644 index 000000000000..c42396d90597 --- /dev/null +++ b/test/DebugInfo/Generic/licm-hoist-debug-loc.ll @@ -0,0 +1,75 @@ +; RUN: opt -S -licm %s | FileCheck %s +; +; LICM should null out debug locations when it hoists instructions out of a loop. +; +; Generated with +; clang -O0 -S -emit-llvm test.c -g -gline-tables-only -o t.ll +; opt -S -sroa -adce -simplifycfg -reassociate -domtree -loops \ +; -loop-simplify -lcssa -basicaa -aa -scalar-evolution -loop-rotate t.ll > test.ll +; +; void bar(int *); +; void foo(int k, int p) +; { +; for (int i = 0; i < k; i++) { +; bar(&p + 4); +; } +; } +; +; We make sure that the instruction that is hoisted into the preheader +; does not have a debug location.
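+; As an illustrative sketch only (this rewrite is not itself checked by +; FileCheck; the CHECK lines below verify it indirectly), LICM turns +;   for.body: +;     %add.ptr = getelementptr inbounds i32, i32* %p.addr, i64 4, !dbg !11 +; into +;   for.body.lr.ph: +;     %add.ptr = getelementptr inbounds i32, i32* %p.addr, i64 4 +; i.e. the hoisted getelementptr loses its !dbg attachment. +;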
+; CHECK: for.body.lr.ph: +; CHECK: getelementptr{{.*}}%p.addr, i64 4{{$}} +; CHECK: for.body: +; +; ModuleID = 't.ll' +source_filename = "test.c" + +; Function Attrs: nounwind sspstrong uwtable +define void @foo(i32 %k, i32 %p) !dbg !7 { +entry: + %p.addr = alloca i32, align 4 + store i32 %p, i32* %p.addr, align 4 + %cmp2 = icmp slt i32 0, %k, !dbg !9 + br i1 %cmp2, label %for.body.lr.ph, label %for.end, !dbg !9 + +for.body.lr.ph: ; preds = %entry + br label %for.body, !dbg !9 + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %add.ptr = getelementptr inbounds i32, i32* %p.addr, i64 4, !dbg !11 + call void @bar(i32* %add.ptr), !dbg !11 + %inc = add nsw i32 %i.03, 1, !dbg !12 + %cmp = icmp slt i32 %inc, %k, !dbg !9 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !dbg !9, !llvm.loop !14 + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end, !dbg !9 + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void, !dbg !16 +} + +declare void @bar(i32*) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.1 (PS4 clang version 4.50.0.249 7e7cd823 checking)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "test.c", directory: "D:\test") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 3.9.1 (PS4 clang version 4.50.0.249 7e7cd823 checking)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 4, scope: !10) +!10 = !DILexicalBlockFile(scope: !7, file: !1, discriminator: 1) +!11 = !DILocation(line: 5, scope: !7) +!12 = !DILocation(line: 4, scope: !13) +!13 = !DILexicalBlockFile(scope: !7, file: !1, discriminator: 2) +!14 = distinct !{!14, !15} +!15 = !DILocation(line: 4, scope: !7) +!16 = !DILocation(line: 7, scope: !7) diff --git a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll index 0667685befc1..ddfd7ca7c364 100644 --- a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll +++ b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll @@ -73,7 +73,43 @@ define void @store.v4i64.0001(<4 x i32*> %arg) sanitize_address { define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) sanitize_address { ; ALL-LABEL: @store.v4f32.variable %p = load <4 x float>*, <4 x float>** @v4f32, align 8 -; ALL-NOT: call void @__asan_store +; STORE: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0 +; STORE: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]] +; STORE: <label>:[[THEN0]]: +; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; STORE: call void @__asan_store4(i64 [[PGEP0]]) +; STORE: br label %[[AFTER0]] +; STORE: <label>:[[AFTER0]] + +; STORE: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1 +; STORE: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]] +; STORE: <label>:[[THEN1]]: +; STORE: [[GEP1:%[0-9A-Za-z]+]] = 
getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 +; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 +; STORE: call void @__asan_store4(i64 [[PGEP1]]) +; STORE: br label %[[AFTER1]] +; STORE: <label>:[[AFTER1]] + +; STORE: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2 +; STORE: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]] +; STORE: <label>:[[THEN2]]: +; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 +; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 +; STORE: call void @__asan_store4(i64 [[PGEP2]]) +; STORE: br label %[[AFTER2]] +; STORE: <label>:[[AFTER2]] + +; STORE: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3 +; STORE: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]] +; STORE: <label>:[[THEN3]]: +; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 +; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 +; STORE: call void @__asan_store4(i64 [[PGEP3]]) +; STORE: br label %[[AFTER3]] +; STORE: <label>:[[AFTER3]] + +; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask) tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask) ret void } @@ -165,7 +201,43 @@ define <4 x i32*> @load.v4i64.0001(<4 x i32*> %arg) sanitize_address { define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) sanitize_address { ; ALL-LABEL: @load.v4f32.variable %p = load <4 x float>*, <4 x float>** @v4f32, align 8 -; ALL-NOT: call void @__asan_load +; LOAD: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0 +; LOAD: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]] +; LOAD: <label>:[[THEN0]]: +; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0 +; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64 +; LOAD: call void @__asan_load4(i64 [[PGEP0]]) +; LOAD: br label %[[AFTER0]] +; LOAD: <label>:[[AFTER0]] + +; LOAD: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1 +; LOAD: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]] +; LOAD: <label>:[[THEN1]]: +; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1 +; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64 +; LOAD: call void @__asan_load4(i64 [[PGEP1]]) +; LOAD: br label %[[AFTER1]] +; LOAD: <label>:[[AFTER1]] + +; LOAD: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2 +; LOAD: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]] +; LOAD: <label>:[[THEN2]]: +; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2 +; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64 +; LOAD: call void @__asan_load4(i64 [[PGEP2]]) +; LOAD: br label %[[AFTER2]] +; LOAD: <label>:[[AFTER2]] + +; LOAD: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3 +; LOAD: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]] +; LOAD: <label>:[[THEN3]]: +; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3 +; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64 +; LOAD: call void @__asan_load4(i64 [[PGEP3]]) +; LOAD: br label %[[AFTER3]] +; LOAD: <label>:[[AFTER3]] + +; LOAD: tail call <4 x float> 
@llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg) %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg) ret <4 x float> %res } diff --git a/test/MC/AsmParser/Inputs/function.x b/test/MC/AsmParser/Inputs/function.x new file mode 100644 index 000000000000..582bfdfa6271 --- /dev/null +++ b/test/MC/AsmParser/Inputs/function.x @@ -0,0 +1,3 @@ + +FUNCTION = 1 + diff --git a/test/MC/AsmParser/Inputs/module.x b/test/MC/AsmParser/Inputs/module.x new file mode 100644 index 000000000000..e93c615ef145 --- /dev/null +++ b/test/MC/AsmParser/Inputs/module.x @@ -0,0 +1,3 @@ + +MODULE = 1 + diff --git a/test/MC/AsmParser/include.ll b/test/MC/AsmParser/include.ll new file mode 100644 index 000000000000..390041ff3300 --- /dev/null +++ b/test/MC/AsmParser/include.ll @@ -0,0 +1,13 @@ +; RUN: llc -I %p/Inputs -filetype asm -o - %s | FileCheck %s + +module asm ".include \22module.x\22" + +define arm_aapcscc void @f() { +entry: + call void asm sideeffect ".include \22function.x\22", ""() + ret void +} + +; CHECK: MODULE = 1 +; CHECK: FUNCTION = 1 + diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt index 0487e3fdfc23..8c8ce65567a4 100644 --- a/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt +++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-fp.txt @@ -231,6 +231,12 @@ # CHECK: fctid. 2, 3 0xfc 0x40 0x1e 0x5d +# CHECK: fctidu 2, 3 +0xfc 0x40 0x1f 0x5c + +# CHECK: fctidu. 2, 3 +0xfc 0x40 0x1f 0x5d + # CHECK: fctidz 2, 3 0xfc 0x40 0x1e 0x5e @@ -249,6 +255,12 @@ # CHECK: fctiw. 2, 3 0xfc 0x40 0x18 0x1d +# CHECK: fctiwu 2, 3 +0xfc 0x40 0x19 0x1c + +# CHECK: fctiwu. 2, 3 +0xfc 0x40 0x19 0x1d + # CHECK: fctiwz 2, 3 0xfc 0x40 0x18 0x1e @@ -309,6 +321,12 @@ # CHECK: frim. 2, 3 0xfc 0x40 0x1b 0xd1 +# CHECK: ftdiv 2, 3, 4 +0xfd 0x03 0x21 0x00 + +# CHECK: ftsqrt 2, 3 +0xfd 0x00 0x19 0x40 + # CHECK: fcmpu 2, 3, 4 0xfd 0x03 0x20 0x00 diff --git a/test/MC/PowerPC/ppc64-encoding-fp.s b/test/MC/PowerPC/ppc64-encoding-fp.s index 48384845715d..0e74840ca167 100644 --- a/test/MC/PowerPC/ppc64-encoding-fp.s +++ b/test/MC/PowerPC/ppc64-encoding-fp.s @@ -188,8 +188,14 @@ # CHECK-BE: frsqrtes. 2, 3 # encoding: [0xec,0x40,0x18,0x35] # CHECK-LE: frsqrtes. 2, 3 # encoding: [0x35,0x18,0x40,0xec] frsqrtes. 2, 3 -# FIXME: ftdiv 2, 3, 4 -# FIXME: ftsqrt 2, 3, 4 + +# CHECK-BE: ftdiv 2, 3, 4 # encoding: [0xfd,0x03,0x21,0x00] +# CHECK-LE: ftdiv 2, 3, 4 # encoding: [0x00,0x21,0x03,0xfd] + ftdiv 2, 3, 4 + +# CHECK-BE: ftsqrt 2, 3 # encoding: [0xfd,0x00,0x19,0x40] +# CHECK-LE: ftsqrt 2, 3 # encoding: [0x40,0x19,0x00,0xfd] + ftsqrt 2, 3 # CHECK-BE: fmadd 2, 3, 4, 5 # encoding: [0xfc,0x43,0x29,0x3a] # CHECK-LE: fmadd 2, 3, 4, 5 # encoding: [0x3a,0x29,0x43,0xfc] fmadd 2, 3, 4, 5 @@ -255,34 +261,48 @@ # CHECK-BE: fctid. 2, 3 # encoding: [0xfc,0x40,0x1e,0x5d] # CHECK-LE: fctid. 2, 3 # encoding: [0x5d,0x1e,0x40,0xfc] fctid. 2, 3 + +# CHECK-BE: fctidu 2, 3 # encoding: [0xfc,0x40,0x1f,0x5c] +# CHECK-LE: fctidu 2, 3 # encoding: [0x5c,0x1f,0x40,0xfc] + fctidu 2, 3 +# CHECK-BE: fctidu. 2, 3 # encoding: [0xfc,0x40,0x1f,0x5d] +# CHECK-LE: fctidu. 2, 3 # encoding: [0x5d,0x1f,0x40,0xfc] + fctidu. 2, 3 + # CHECK-BE: fctidz 2, 3 # encoding: [0xfc,0x40,0x1e,0x5e] # CHECK-LE: fctidz 2, 3 # encoding: [0x5e,0x1e,0x40,0xfc] fctidz 2, 3 # CHECK-BE: fctidz. 2, 3 # encoding: [0xfc,0x40,0x1e,0x5f] # CHECK-LE: fctidz. 2, 3 # encoding: [0x5f,0x1e,0x40,0xfc] fctidz. 2, 3 -# FIXME: fctidu 2, 3 -# FIXME: fctidu.
2, 3 + # CHECK-BE: fctiduz 2, 3 # encoding: [0xfc,0x40,0x1f,0x5e] # CHECK-LE: fctiduz 2, 3 # encoding: [0x5e,0x1f,0x40,0xfc] fctiduz 2, 3 # CHECK-BE: fctiduz. 2, 3 # encoding: [0xfc,0x40,0x1f,0x5f] # CHECK-LE: fctiduz. 2, 3 # encoding: [0x5f,0x1f,0x40,0xfc] fctiduz. 2, 3 + # CHECK-BE: fctiw 2, 3 # encoding: [0xfc,0x40,0x18,0x1c] # CHECK-LE: fctiw 2, 3 # encoding: [0x1c,0x18,0x40,0xfc] fctiw 2, 3 # CHECK-BE: fctiw. 2, 3 # encoding: [0xfc,0x40,0x18,0x1d] # CHECK-LE: fctiw. 2, 3 # encoding: [0x1d,0x18,0x40,0xfc] fctiw. 2, 3 + +# CHECK-BE: fctiwu 2, 3 # encoding: [0xfc,0x40,0x19,0x1c] +# CHECK-LE: fctiwu 2, 3 # encoding: [0x1c,0x19,0x40,0xfc] + fctiwu 2, 3 +# CHECK-BE: fctiwu. 2, 3 # encoding: [0xfc,0x40,0x19,0x1d] +# CHECK-LE: fctiwu. 2, 3 # encoding: [0x1d,0x19,0x40,0xfc] + fctiwu. 2, 3 + # CHECK-BE: fctiwz 2, 3 # encoding: [0xfc,0x40,0x18,0x1e] # CHECK-LE: fctiwz 2, 3 # encoding: [0x1e,0x18,0x40,0xfc] fctiwz 2, 3 # CHECK-BE: fctiwz. 2, 3 # encoding: [0xfc,0x40,0x18,0x1f] # CHECK-LE: fctiwz. 2, 3 # encoding: [0x1f,0x18,0x40,0xfc] fctiwz. 2, 3 -# FIXME: fctiwu 2, 3 -# FIXME: fctiwu. 2, 3 # CHECK-BE: fctiwuz 2, 3 # encoding: [0xfc,0x40,0x19,0x1e] # CHECK-LE: fctiwuz 2, 3 # encoding: [0x1e,0x19,0x40,0xfc] fctiwuz 2, 3 diff --git a/test/ThinLTO/X86/Inputs/deadstrip.ll b/test/ThinLTO/X86/Inputs/deadstrip.ll new file mode 100644 index 000000000000..a9161a31b249 --- /dev/null +++ b/test/ThinLTO/X86/Inputs/deadstrip.ll @@ -0,0 +1,22 @@ +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +declare void @dead_func() + +; Called from @dead_func() in the other file; it should not be imported there. +; Ensure the cycle formed by calling @dead_func doesn't prevent stripping. +define void @baz() { + call void @dead_func() + ret void +} + +; Called via llvm.global_ctors, should be detected as live via the +; marking of llvm.global_ctors as a live root in the index.
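+; For reference, the live root in the main test file has the shape +;   @llvm.global_ctors = appending global [1 x { i32, void ()* }] +;                        [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] +; and @_GLOBAL__I_a in turn calls @boo, which is what keeps @boo live.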
+define void @boo() { + ret void +} + +define void @another_dead_func() { + call void @dead_func() + ret void +} diff --git a/test/ThinLTO/X86/Inputs/lazyload_metadata.ll b/test/ThinLTO/X86/Inputs/lazyload_metadata.ll new file mode 100644 index 000000000000..f51a08a1a3fc --- /dev/null +++ b/test/ThinLTO/X86/Inputs/lazyload_metadata.ll @@ -0,0 +1,12 @@ +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +declare void @globalfunc1() + + +define i32 @main() { + call void @globalfunc1() + ret i32 0 +} + + diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll new file mode 100644 index 000000000000..6f1cbfe59693 --- /dev/null +++ b/test/ThinLTO/X86/deadstrip.ll @@ -0,0 +1,109 @@ +; RUN: opt -module-summary %s -o %t1.bc +; RUN: opt -module-summary %p/Inputs/deadstrip.ll -o %t2.bc +; RUN: llvm-lto -thinlto-action=thinlink -o %t.index.bc %t1.bc %t2.bc + +; RUN: llvm-lto -exported-symbol=_main -thinlto-action=promote %t1.bc -thinlto-index=%t.index.bc -o - | llvm-lto -exported-symbol=_main -thinlto-action=internalize -thinlto-index %t.index.bc -thinlto-module-id=%t1.bc - -o - | llvm-dis -o - | FileCheck %s +; RUN: llvm-lto -exported-symbol=_main -thinlto-action=promote %t2.bc -thinlto-index=%t.index.bc -o - | llvm-lto -exported-symbol=_main -thinlto-action=internalize -thinlto-index %t.index.bc -thinlto-module-id=%t2.bc - -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK2 + +; RUN: llvm-lto -exported-symbol=_main -thinlto-action=run %t1.bc %t2.bc +; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck %s --check-prefix=CHECK-NM + +; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.out -save-temps \ +; RUN: -r %t1.bc,_main,plx \ +; RUN: -r %t1.bc,_bar,pl \ +; RUN: -r %t1.bc,_dead_func,pl \ +; RUN: -r %t1.bc,_baz,l \ +; RUN: -r %t1.bc,_boo,l \ +; RUN: -r %t2.bc,_baz,pl \ +; RUN: -r %t2.bc,_boo,pl \ +; RUN: -r %t2.bc,_dead_func,pl \ +; RUN: -r %t2.bc,_another_dead_func,pl +; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s +; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2 +; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM + +; Dead-stripping on the index allows us to internalize these, +; and to limit the import of @baz thanks to early pruning. +; CHECK-NOT: available_externally {{.*}} @baz() +; CHECK: @llvm.global_ctors = +; CHECK: define internal void @_GLOBAL__I_a() +; CHECK: define internal void @bar() { +; CHECK: define internal void @bar_internal() +; CHECK: define internal void @dead_func() { +; CHECK-NOT: available_externally {{.*}} @baz() + +; Make sure we didn't internalize @boo, which is reachable via +; llvm.global_ctors +; CHECK2: define void @boo() +; We should eventually have removed @baz since it was internalized and unused +; CHECK2-NM-NOT: _baz + +; The final binary should not contain any of the dead functions; +; only main is expected, since bar should be inlined and stripped out. +; CHECK-NM-NOT: bar +; CHECK-NM-NOT: dead +; CHECK-NM: T _main +; CHECK-NM-NOT: bar +; CHECK-NM-NOT: dead + +; Next test the case where Inputs/deadstrip.ll does not get a module index, +; which will cause it to be handled by regular LTO in the new LTO API. +; In that case there are uses of @dead_func in the regular LTO partition +; and it shouldn't be internalized.
+; RUN: opt %p/Inputs/deadstrip.ll -o %t3.bc +; RUN: llvm-lto2 %t1.bc %t3.bc -o %t4.out -save-temps \ +; RUN: -r %t1.bc,_main,plx \ +; RUN: -r %t1.bc,_bar,pl \ +; RUN: -r %t1.bc,_dead_func,pl \ +; RUN: -r %t1.bc,_baz,l \ +; RUN: -r %t1.bc,_boo,l \ +; RUN: -r %t3.bc,_baz,pl \ +; RUN: -r %t3.bc,_boo,pl \ +; RUN: -r %t3.bc,_dead_func,pl \ +; RUN: -r %t3.bc,_another_dead_func,pl +; RUN: llvm-dis < %t4.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK-NOTDEAD +; RUN: llvm-nm %t4.out.0 | FileCheck %s --check-prefix=CHECK-NM-NOTDEAD + +; We can't internalize @dead_func because of the use in the regular LTO +; partition. +; CHECK-NOTDEAD: define void @dead_func() +; We also can't eliminate @baz because it is in the regular LTO partition +; and called from @dead_func. +; CHECK-NM-NOTDEAD: T _baz + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + + +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] + +declare void @baz() + +declare void @boo() + +define internal void @_GLOBAL__I_a() #1 section "__TEXT,__StaticInit,regular,pure_instructions" { +entry: + call void @boo() + ret void +} + +define void @bar() { + ret void +} + +define internal void @bar_internal() { + ret void +} + +define void @dead_func() { + call void @bar() + call void @baz() + call void @bar_internal() + ret void +} + +define void @main() { + call void @bar() + call void @bar_internal() + ret void +} diff --git a/test/ThinLTO/X86/lazyload_metadata.ll b/test/ThinLTO/X86/lazyload_metadata.ll new file mode 100644 index 000000000000..3c4345831aa3 --- /dev/null +++ b/test/ThinLTO/X86/lazyload_metadata.ll @@ -0,0 +1,54 @@ +; Do setup work for all below tests: generate bitcode and combined index +; RUN: opt -module-summary %s -o %t.bc -bitcode-mdindex-threshold=0 +; RUN: opt -module-summary %p/Inputs/lazyload_metadata.ll -o %t2.bc -bitcode-mdindex-threshold=0 +; RUN: llvm-lto -thinlto-action=thinlink -o %t3.bc %t.bc %t2.bc +; REQUIRES: asserts + +; Check that importing @globalfunc1 does not trigger loading all the global +; metadata for @globalfunc2 and @globalfunc3 + +; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ +; RUN: -o /dev/null -stats \ +; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY +; LAZY: 49 bitcode-reader - Number of Metadata records loaded +; LAZY: 1 bitcode-reader - Number of MDStrings loaded + +; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ +; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \ +; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY +; NOTLAZY: 58 bitcode-reader - Number of Metadata records loaded +; NOTLAZY: 8 bitcode-reader - Number of MDStrings loaded + + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +define void @globalfunc1(i32 %arg) { + %tmp = add i32 %arg, 0, !metadata !2 + ret void +} + +; We need two functions here that will both reference the same metadata. +; This is to force the metadata to be emitted in the global metadata block and +; not in the function-specific metadata. +; These functions are not imported, so we don't want to load their metadata.
+ +define void @globalfunc2(i32 %arg) { + %tmp = add i32 %arg, 0, !metadata !1 + ret void +} + +define void @globalfunc3(i32 %arg) { + %tmp = add i32 %arg, 0, !metadata !1 + ret void +} + +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"Hello World"} +!3 = !{!"3"} +!4 = !{!"4"} +!5 = !{!"5"} +!6 = !{!"6"} +!7 = !{!"7"} +!8 = !{!"8"} +!9 = !{!"9"} diff --git a/test/Transforms/GVN/PRE/phi-translate.ll b/test/Transforms/GVN/PRE/phi-translate.ll index 42335486910c..1f6c7c8d33ea 100644 --- a/test/Transforms/GVN/PRE/phi-translate.ll +++ b/test/Transforms/GVN/PRE/phi-translate.ll @@ -4,18 +4,17 @@ target datalayout = "e-p:64:64:64" ; CHECK-LABEL: @foo( ; CHECK: entry.end_crit_edge: -; CHECK: %j.phi.trans.insert = sext i32 %x to i64, !dbg [[J_LOC:![0-9]+]] -; CHECK: %q.phi.trans.insert = getelementptr {{.*}}, !dbg [[Q_LOC:![0-9]+]] -; CHECK: %n.pre = load i32, i32* %q.phi.trans.insert, !dbg [[N_LOC:![0-9]+]] +; CHECK: %[[INDEX:[a-z0-9.]+]] = sext i32 %x to i64{{$}} +; CHECK: %[[ADDRESS:[a-z0-9.]+]] = getelementptr [100 x i32], [100 x i32]* @G, i64 0, i64 %[[INDEX]]{{$}} +; CHECK: %n.pre = load i32, i32* %[[ADDRESS]]{{$}} +; CHECK: br label %end ; CHECK: then: ; CHECK: store i32 %z ; CHECK: end: -; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC]] +; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC:![0-9]+]] ; CHECK: ret i32 %n -; CHECK-DAG: [[J_LOC]] = !DILocation(line: 45, column: 1, scope: !{{.*}}) -; CHECK-DAG: [[Q_LOC]] = !DILocation(line: 46, column: 1, scope: !{{.*}}) -; CHECK-DAG: [[N_LOC]] = !DILocation(line: 47, column: 1, scope: !{{.*}}) +; CHECK: [[N_LOC]] = !DILocation(line: 47, column: 1, scope: !{{.*}}) @G = external global [100 x i32] define i32 @foo(i32 %x, i32 %z) !dbg !6 { diff --git a/test/Transforms/InstCombine/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/amdgcn-intrinsics.ll index 3c38e7890620..a228968f25bc 100644 --- a/test/Transforms/InstCombine/amdgcn-intrinsics.ll +++ b/test/Transforms/InstCombine/amdgcn-intrinsics.ll @@ -599,3 +599,37 @@ define i1 @test_constant_class_snan_test_pinf_f64() nounwind { %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 512) ret i1 %val } + +; -------------------------------------------------------------------- +; llvm.amdgcn.cos +; -------------------------------------------------------------------- +declare float @llvm.amdgcn.cos.f32(float) nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; CHECK-LABEL: @cos_fneg_f32( +; CHECK: %cos = call float @llvm.amdgcn.cos.f32(float %x) +; CHECK-NEXT: ret float %cos +define float @cos_fneg_f32(float %x) { + %x.fneg = fsub float -0.0, %x + %cos = call float @llvm.amdgcn.cos.f32(float %x.fneg) + ret float %cos +} + +; CHECK-LABEL: @cos_fabs_f32( +; CHECK-NEXT: %cos = call float @llvm.amdgcn.cos.f32(float %x) +; CHECK-NEXT: ret float %cos +define float @cos_fabs_f32(float %x) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %cos = call float @llvm.amdgcn.cos.f32(float %x.fabs) + ret float %cos +} + +; CHECK-LABEL: @cos_fabs_fneg_f32( +; CHECK-NEXT: %cos = call float @llvm.amdgcn.cos.f32(float %x) +; CHECK-NEXT: ret float %cos +define float @cos_fabs_fneg_f32(float %x) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %x.fabs.fneg = fsub float -0.0, %x.fabs + %cos = call float @llvm.amdgcn.cos.f32(float %x.fabs.fneg) + ret float %cos +} diff --git a/test/Transforms/InstCombine/cos-intrinsic.ll b/test/Transforms/InstCombine/cos-intrinsic.ll index b4d07cf8047b..24b605e9c8c8 100644 --- 
a/test/Transforms/InstCombine/cos-intrinsic.ll +++ b/test/Transforms/InstCombine/cos-intrinsic.ll @@ -3,6 +3,10 @@ declare double @llvm.cos.f64(double %Val) declare float @llvm.cos.f32(float %Val) +declare <2 x float> @llvm.cos.v2f32(<2 x float> %Val) + +declare float @llvm.fabs.f32(float %Val) +declare <2 x float> @llvm.fabs.v2f32(<2 x float> %Val) ; Function Attrs: nounwind readnone define double @test1() { @@ -24,3 +28,54 @@ define float @test2(float %d) { ; CHECK-NEXT: %fsum ; CHECK: ret float %fsum } + +; CHECK-LABEL: @cos_fneg_f32( +; CHECK: %cos = call float @llvm.cos.f32(float %x) +; CHECK-NEXT: ret float %cos +define float @cos_fneg_f32(float %x) { + %x.fneg = fsub float -0.0, %x + %cos = call float @llvm.cos.f32(float %x.fneg) + ret float %cos +} + +; FIXME: m_FNeg() doesn't handle vectors +; CHECK-LABEL: @cos_fneg_v2f32( +; CHECK: %x.fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %x +; CHECK-NEXT: %cos = call <2 x float> @llvm.cos.v2f32(<2 x float> %x.fneg) +; CHECK-NEXT: ret <2 x float> %cos +define <2 x float> @cos_fneg_v2f32(<2 x float> %x) { + %x.fneg = fsub <2 x float> <float -0.0, float -0.0>, %x + %cos = call <2 x float> @llvm.cos.v2f32(<2 x float> %x.fneg) + ret <2 x float> %cos +} + +; CHECK-LABEL: @cos_fabs_f32( +; CHECK-NEXT: %cos = call float @llvm.cos.f32(float %x) +; CHECK-NEXT: ret float %cos +define float @cos_fabs_f32(float %x) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %cos = call float @llvm.cos.f32(float %x.fabs) + ret float %cos +} + +; CHECK-LABEL: @cos_fabs_fneg_f32( +; CHECK: %cos = call float @llvm.cos.f32(float %x) +; CHECK-NEXT: ret float %cos +define float @cos_fabs_fneg_f32(float %x) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %x.fabs.fneg = fsub float -0.0, %x.fabs + %cos = call float @llvm.cos.f32(float %x.fabs.fneg) + ret float %cos +} + +; CHECK-LABEL: @cos_fabs_fneg_v2f32( +; CHECK: %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x) +; CHECK-NEXT: %x.fabs.fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %x.fabs +; CHECK-NEXT: %cos = call <2 x float> @llvm.cos.v2f32(<2 x float> %x.fabs.fneg) +; CHECK-NEXT: ret <2 x float> %cos +define <2 x float> @cos_fabs_fneg_v2f32(<2 x float> %x) { + %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x) + %x.fabs.fneg = fsub <2 x float> <float -0.0, float -0.0>, %x.fabs + %cos = call <2 x float> @llvm.cos.v2f32(<2 x float> %x.fabs.fneg) + ret <2 x float> %cos +} diff --git a/test/Transforms/InstCombine/icmp-shl-nsw.ll b/test/Transforms/InstCombine/icmp-shl-nsw.ll new file mode 100644 index 000000000000..896a45625b9f --- /dev/null +++ b/test/Transforms/InstCombine/icmp-shl-nsw.ll @@ -0,0 +1,218 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +; If the (shl x, C) preserved the sign and this is a sign test, +; compare the LHS operand instead + +define i1 @icmp_shl_nsw_sgt(i32 %x) { +; CHECK-LABEL: @icmp_shl_nsw_sgt( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i32 %x, 21 + %cmp = icmp sgt i32 %shl, 0 + ret i1 %cmp +} + +define i1 @icmp_shl_nsw_sge0(i32 %x) { +; CHECK-LABEL: @icmp_shl_nsw_sge0( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, -1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i32 %x, 21 + %cmp = icmp sge i32 %shl, 0 + ret i1 %cmp +} + +define i1 @icmp_shl_nsw_sge1(i32 %x) { +; CHECK-LABEL: @icmp_shl_nsw_sge1( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = 
shl nsw i32 %x, 21 + %cmp = icmp sge i32 %shl, 1 + ret i1 %cmp +} + +define <2 x i1> @icmp_shl_nsw_sge1_vec(<2 x i32> %x) { +; CHECK-LABEL: @icmp_shl_nsw_sge1_vec( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> %x, zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %shl = shl nsw <2 x i32> %x, <i32 21, i32 21> + %cmp = icmp sge <2 x i32> %shl, <i32 1, i32 1> + ret <2 x i1> %cmp +} + +; Checks for icmp (eq|ne) (shl x, C), 0 + +define i1 @icmp_shl_nsw_eq(i32 %x) { +; CHECK-LABEL: @icmp_shl_nsw_eq( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 %x, 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %mul = shl nsw i32 %x, 5 + %cmp = icmp eq i32 %mul, 0 + ret i1 %cmp +} + +define <2 x i1> @icmp_shl_nsw_eq_vec(<2 x i32> %x) { +; CHECK-LABEL: @icmp_shl_nsw_eq_vec( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> %x, zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %mul = shl nsw <2 x i32> %x, <i32 5, i32 5> + %cmp = icmp eq <2 x i32> %mul, zeroinitializer + ret <2 x i1> %cmp +} + +; icmp sgt with shl nsw with a constant compare operand and constant +; shift amount can always be reduced to icmp sgt alone. + +; Known bits analysis turns this into an equality predicate. + +define i1 @icmp_sgt1(i8 %x) { +; CHECK-LABEL: @icmp_sgt1( +; CHECK-NEXT: [[SHL_MASK:%.*]] = and i8 %x, 127 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[SHL_MASK]], 64 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, -128 + ret i1 %cmp +} + +define i1 @icmp_sgt2(i8 %x) { +; CHECK-LABEL: @icmp_sgt2( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -127 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, -127 + ret i1 %cmp +} + +define i1 @icmp_sgt3(i8 %x) { +; CHECK-LABEL: @icmp_sgt3( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -16 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, -16 + ret i1 %cmp +} + +define i1 @icmp_sgt4(i8 %x) { +; CHECK-LABEL: @icmp_sgt4( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, -2 + ret i1 %cmp +} + +; x >s -1 is a sign bit test. +; x >s 0 is a sign bit test. + +define i1 @icmp_sgt5(i8 %x) { +; CHECK-LABEL: @icmp_sgt5( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], 1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, 1 + ret i1 %cmp +} + +define i1 @icmp_sgt6(i8 %x) { +; CHECK-LABEL: @icmp_sgt6( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], 16 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, 16 + ret i1 %cmp +} + +define i1 @icmp_sgt7(i8 %x) { +; CHECK-LABEL: @icmp_sgt7( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], 124 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, 124 + ret i1 %cmp +} + +; Known bits analysis turns this into an equality predicate. + +define i1 @icmp_sgt8(i8 %x) { +; CHECK-LABEL: @icmp_sgt8( +; CHECK-NEXT: [[SHL_MASK:%.*]] = and i8 %x, 127 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHL_MASK]], 63 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 1 + %cmp = icmp sgt i8 %shl, 125 + ret i1 %cmp +} + +; Compares with 126 and 127 are recognized as always false. 
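+;
+; A worked check of the shift-by-1 cases: "shl nsw i8 %x, 1" always yields an
+; even value, so "sgt 125" can only be satisfied by 126, i.e. the low 7 bits
+; of %x equal 63, which is exactly the (and %x, 127) == 63 form above.
+; "sgt 126" would require the odd value 127, and "sgt 127" exceeds the i8
+; maximum, so both are always false.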
+ +; Known bits analysis turns this into an equality predicate. + +define i1 @icmp_sgt9(i8 %x) { +; CHECK-LABEL: @icmp_sgt9( +; CHECK-NEXT: [[SHL_MASK:%.*]] = and i8 %x, 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[SHL_MASK]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 7 + %cmp = icmp sgt i8 %shl, -128 + ret i1 %cmp +} + +define i1 @icmp_sgt10(i8 %x) { +; CHECK-LABEL: @icmp_sgt10( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -127 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 7 + %cmp = icmp sgt i8 %shl, -127 + ret i1 %cmp +} + +define i1 @icmp_sgt11(i8 %x) { +; CHECK-LABEL: @icmp_sgt11( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw i8 %x, 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[SHL]], -2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl nsw i8 %x, 7 + %cmp = icmp sgt i8 %shl, -2 + ret i1 %cmp +} + +; Splat vector version should fold the same way. + +define <2 x i1> @icmp_sgt11_vec(<2 x i8> %x) { +; CHECK-LABEL: @icmp_sgt11_vec( +; CHECK-NEXT: [[SHL:%.*]] = shl nsw <2 x i8> %x, <i8 7, i8 7> +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i8> [[SHL]], <i8 -2, i8 -2> +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %shl = shl nsw <2 x i8> %x, <i8 7, i8 7> + %cmp = icmp sgt <2 x i8> %shl, <i8 -2, i8 -2> + ret <2 x i1> %cmp +} + +; Known bits analysis returns false for compares with >=0. + diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll index 099aaca5f7d6..32fe050bf83f 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -1243,69 +1243,6 @@ define i1 @icmp_shl24(i32 %x) { ret i1 %cmp } -; If the (shl x, C) preserved the sign and this is a sign test, -; compare the LHS operand instead -define i1 @icmp_shl_nsw_sgt(i32 %x) { -; CHECK-LABEL: @icmp_shl_nsw_sgt( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, 0 -; CHECK-NEXT: ret i1 [[CMP]] -; - %shl = shl nsw i32 %x, 21 - %cmp = icmp sgt i32 %shl, 0 - ret i1 %cmp -} - -define i1 @icmp_shl_nsw_sge0(i32 %x) { -; CHECK-LABEL: @icmp_shl_nsw_sge0( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, -1 -; CHECK-NEXT: ret i1 [[CMP]] -; - %shl = shl nsw i32 %x, 21 - %cmp = icmp sge i32 %shl, 0 - ret i1 %cmp -} - -define i1 @icmp_shl_nsw_sge1(i32 %x) { -; CHECK-LABEL: @icmp_shl_nsw_sge1( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %x, 0 -; CHECK-NEXT: ret i1 [[CMP]] -; - %shl = shl nsw i32 %x, 21 - %cmp = icmp sge i32 %shl, 1 - ret i1 %cmp -} - -define <2 x i1> @icmp_shl_nsw_sge1_vec(<2 x i32> %x) { -; CHECK-LABEL: @icmp_shl_nsw_sge1_vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> %x, zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[CMP]] -; - %shl = shl nsw <2 x i32> %x, <i32 21, i32 21> - %cmp = icmp sge <2 x i32> %shl, <i32 1, i32 1> - ret <2 x i1> %cmp -} - -; Checks for icmp (eq|ne) (shl x, C), 0 -define i1 @icmp_shl_nsw_eq(i32 %x) { -; CHECK-LABEL: @icmp_shl_nsw_eq( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 %x, 0 -; CHECK-NEXT: ret i1 [[CMP]] -; - %mul = shl nsw i32 %x, 5 - %cmp = icmp eq i32 %mul, 0 - ret i1 %cmp -} - -define <2 x i1> @icmp_shl_nsw_eq_vec(<2 x i32> %x) { -; CHECK-LABEL: @icmp_shl_nsw_eq_vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> %x, zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[CMP]] -; - %mul = shl nsw <2 x i32> %x, <i32 5, i32 5> - %cmp = icmp eq <2 x i32> %mul, zeroinitializer - ret <2 x i1> %cmp -} - define i1 @icmp_shl_eq(i32 %x) { ; CHECK-LABEL: @icmp_shl_eq( ; CHECK-NEXT: [[MUL_MASK:%.*]] = and i32 %x, 134217727 diff --git a/test/Transforms/InstSimplify/select.ll 
b/test/Transforms/InstSimplify/select.ll index 6ddaaba20461..1acb5c469d37 100644 --- a/test/Transforms/InstSimplify/select.ll +++ b/test/Transforms/InstSimplify/select.ll @@ -402,3 +402,31 @@ define i32* @select_icmp_pointers(i32* %x, i32* %y) { ret i32* %sel } +; FIXME: If the condition is known, we don't need to select. + +declare void @llvm.assume(i1) + +define i8 @assume_sel_cond(i1 %cond, i8 %x, i8 %y) { +; CHECK-LABEL: @assume_sel_cond( +; CHECK-NEXT: call void @llvm.assume(i1 %cond) +; CHECK-NEXT: [[SEL:%.*]] = select i1 %cond, i8 %x, i8 %y +; CHECK-NEXT: ret i8 [[SEL]] +; + call void @llvm.assume(i1 %cond) + %sel = select i1 %cond, i8 %x, i8 %y + ret i8 %sel +} + +define i8 @do_not_assume_sel_cond(i1 %cond, i8 %x, i8 %y) { +; CHECK-LABEL: @do_not_assume_sel_cond( +; CHECK-NEXT: [[NOTCOND:%.*]] = icmp eq i1 %cond, false +; CHECK-NEXT: call void @llvm.assume(i1 [[NOTCOND]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 %cond, i8 %x, i8 %y +; CHECK-NEXT: ret i8 [[SEL]] +; + %notcond = icmp eq i1 %cond, false + call void @llvm.assume(i1 %notcond) + %sel = select i1 %cond, i8 %x, i8 %y + ret i8 %sel +} + diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar_promote.ll index 91cdbdbc2269..dc5151be8a82 100644 --- a/test/Transforms/LICM/scalar_promote.ll +++ b/test/Transforms/LICM/scalar_promote.ll @@ -186,6 +186,198 @@ for.end: ; preds = %for.cond.for.end_cr ; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %gi, align 4, !tbaa !0 } +declare i32 @opaque(i32) argmemonly +declare void @capture(i32*) + +; We can promote even if opaque may throw. +define i32 @test7() { +; CHECK-LABEL: @test7( +; CHECK: entry: +; CHECK-NEXT: %local = alloca +; CHECK-NEXT: call void @capture(i32* %local) +; CHECK-NEXT: load i32, i32* %local +; CHECK-NEXT: br label %loop +; CHECK: exit: +; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %loop ] +; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local +; CHECK-NEXT: %ret = load i32, i32* %local +; CHECK-NEXT: ret i32 %ret +entry: + %local = alloca i32 + call void @capture(i32* %local) + br label %loop + +loop: + %j = phi i32 [ 0, %entry ], [ %next, %loop ] + %x = load i32, i32* %local + %x2 = call i32 @opaque(i32 %x) ; Note this does not capture %local + store i32 %x2, i32* %local + %next = add i32 %j, 1 + %cond = icmp eq i32 %next, 0 + br i1 %cond, label %exit, label %loop + +exit: + %ret = load i32, i32* %local + ret i32 %ret +} + +; Make sure we don't promote if the store is really control-flow dependent. 
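+; Promoting would sink an unconditional store to %local out of the loop,
+; executing a store on paths where the original code performed none; since
+; %local escapes via @capture and @opaque may throw, that difference could
+; be observable.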
+define i32 @test7bad() {
+; CHECK-LABEL: @test7bad(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: br label %loop
+; CHECK: if:
+; CHECK-NEXT: store i32 %x2, i32* %local
+; CHECK-NEXT: br label %else
+; CHECK: exit:
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+ %local = alloca i32
+ call void @capture(i32* %local)
+ br label %loop
+loop:
+ %j = phi i32 [ 0, %entry ], [ %next, %else ]
+ %x = load i32, i32* %local
+ %x2 = call i32 @opaque(i32 %x) ; Note this does not capture %local
+ %cmp = icmp eq i32 %x2, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ store i32 %x2, i32* %local
+ br label %else
+
+else:
+ %next = add i32 %j, 1
+ %cond = icmp eq i32 %next, 0
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %ret = load i32, i32* %local
+ ret i32 %ret
+}
+
+; Even if neither the load nor the store is guaranteed to execute (because
+; opaque() may throw), we can still promote: the load not being guaranteed
+; doesn't block us, because %local is always dereferenceable.
+define i32 @test8() {
+; CHECK-LABEL: @test8(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %loop ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+ %local = alloca i32
+ call void @capture(i32* %local)
+ br label %loop
+
+loop:
+ %j = phi i32 [ 0, %entry ], [ %next, %loop ]
+ %throwaway = call i32 @opaque(i32 %j)
+ %x = load i32, i32* %local
+ %x2 = call i32 @opaque(i32 %x)
+ store i32 %x2, i32* %local
+ %next = add i32 %j, 1
+ %cond = icmp eq i32 %next, 0
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %ret = load i32, i32* %local
+ ret i32 %ret
+}
+
+
+; If the store is "guaranteed modulo exceptions", and the load depends on
+; control flow, we can only promote if the pointer is otherwise known to be
+; dereferenceable.
+define i32 @test9() {
+; CHECK-LABEL: @test9(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %else ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+ %local = alloca i32
+ call void @capture(i32* %local)
+ br label %loop
+
+loop:
+ %j = phi i32 [ 0, %entry ], [ %next, %else ]
+ %j2 = call i32 @opaque(i32 %j)
+ %cmp = icmp eq i32 %j2, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %x = load i32, i32* %local
+ br label %else
+
+else:
+ %x2 = phi i32 [ 0, %loop ], [ %x, %if]
+ store i32 %x2, i32* %local
+ %next = add i32 %j, 1
+ %cond = icmp eq i32 %next, 0
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %ret = load i32, i32* %local
+ ret i32 %ret
+}
+
+define i32 @test9bad(i32 %i) {
+; CHECK-LABEL: @test9bad(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: %notderef = getelementptr
+; CHECK-NEXT: br label %loop
+; CHECK: if:
+; CHECK-NEXT: load i32, i32* %notderef
+; CHECK-NEXT: br label %else
+; CHECK: exit:
+; CHECK-NEXT: %ret = load i32, i32* %notderef
+; CHECK-NEXT: ret i32 %ret
+entry:
+ %local = alloca i32
+ call void @capture(i32* %local)
+ %notderef = getelementptr i32, i32* %local, i32 %i
+ br label %loop
+
+loop:
+ %j = phi i32 [ 0, %entry ], [ %next, %else ]
+ %j2 = call i32 @opaque(i32 %j)
+ %cmp = icmp eq i32 %j2, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %x = load i32, i32* %notderef
+ br label %else
+
+else:
+ %x2 = phi i32 [ 0, %loop ], [ %x, %if]
+ store i32 %x2, i32* %notderef
+ %next = add i32 %j, 1
+ %cond = icmp eq i32 %next, 0
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ %ret = load i32, i32* %notderef
+ ret i32 %ret
+}
+
 !0 = !{!4, !4, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
new file mode 100644
index 000000000000..645f3360543c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -0,0 +1,54 @@
+; This test checks that the given loop is still beneficial for vectorization
+; even if it contains a scalarized load (a gather on AVX2).
+; RUN: opt < %s -loop-vectorize -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+ %idxprom = sext i32 %i to i64
+ %idxprom5 = sext i32 %j to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body
+ ret i32 %add7
+
+ for.body: ; preds = %for.body, %entry
+ ; the loop gets vectorized:
+ ; the first, consecutive load becomes a vector load
+ ; CHECK: %wide.load = load <8 x i32>
+ ; the second, strided load is scalarized
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+ %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
+ %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+ %mul = mul nsw i32 %1, %0
+ %add = add i32 %sum.015, 4
+ %add7 = add i32 %add, %mul
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
new file mode 100644
index 000000000000..d2a3ef81a3a4
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
@@ -0,0 +1,10 @@
+---
+GlobalValueMap:
+ 42:
+ - TypeTests: [123]
+TypeIdMap:
+ typeid1:
+ TTRes:
+ Kind: Unsat
+ SizeBitWidth: 0
+...
diff --git a/test/Transforms/LowerTypeTests/export-nothing.ll b/test/Transforms/LowerTypeTests/export-nothing.ll new file mode 100644 index 000000000000..9ab41b5f6cb6 --- /dev/null +++ b/test/Transforms/LowerTypeTests/export-nothing.ll @@ -0,0 +1,7 @@ +; RUN: opt -lowertypetests -lowertypetests-summary-action=export -lowertypetests-write-summary=%t -o /dev/null %s +; RUN: FileCheck %s < %t + +; CHECK: --- +; CHECK-NEXT: GlobalValueMap: +; CHECK-NEXT: TypeIdMap: +; CHECK-NEXT: ... diff --git a/test/Transforms/LowerTypeTests/function-disjoint.ll b/test/Transforms/LowerTypeTests/function-disjoint.ll index 0f9d4a32d150..f39c8eec47c5 100644 --- a/test/Transforms/LowerTypeTests/function-disjoint.ll +++ b/test/Transforms/LowerTypeTests/function-disjoint.ll @@ -30,10 +30,10 @@ declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone define i1 @foo(i8* %p) { ; X64: icmp eq i64 {{.*}}, ptrtoint (void ()* @[[JT0]] to i64) - ; WASM32: icmp eq i64 {{.*}}, 1 + ; WASM32: icmp eq i64 {{.*}}, ptrtoint (i8* getelementptr (i8, i8* null, i64 1) to i64) %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1") ; X64: icmp eq i64 {{.*}}, ptrtoint (void ()* @[[JT1]] to i64) - ; WASM32: icmp eq i64 {{.*}}, 2 + ; WASM32: icmp eq i64 {{.*}}, mul (i64 ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64), i64 2) %y = call i1 @llvm.type.test(i8* %p, metadata !"typeid2") %z = add i1 %x, %y ret i1 %z @@ -46,4 +46,4 @@ define i1 @foo(i8* %p) { ; X64: call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(void ()* @g.cfi) ; WASM32: ![[I0]] = !{i64 1} -; WASM32: ![[I1]] = !{i64 2}
\ No newline at end of file +; WASM32: ![[I1]] = !{i64 2} diff --git a/test/Transforms/LowerTypeTests/function-ext.ll b/test/Transforms/LowerTypeTests/function-ext.ll index 7ed4330a8b58..8318cad89d54 100644 --- a/test/Transforms/LowerTypeTests/function-ext.ll +++ b/test/Transforms/LowerTypeTests/function-ext.ll @@ -11,8 +11,7 @@ declare !type !0 void @foo() define i1 @bar(i8* %ptr) { ; X64: icmp eq i64 {{.*}}, ptrtoint (void ()* @[[JT:.*]] to i64) - ; WASM32: sub i64 {{.*}}, 0 - ; WASM32: icmp ult i64 {{.*}}, 1 + ; WASM32: ret i1 false %p = call i1 @llvm.type.test(i8* %ptr, metadata !"void") ret i1 %p } diff --git a/test/Transforms/LowerTypeTests/function.ll b/test/Transforms/LowerTypeTests/function.ll index 287825829150..9abea8f854c1 100644 --- a/test/Transforms/LowerTypeTests/function.ll +++ b/test/Transforms/LowerTypeTests/function.ll @@ -42,7 +42,7 @@ declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone define i1 @foo(i8* %p) { ; NATIVE: sub i64 {{.*}}, ptrtoint (void ()* @[[JT]] to i64) - ; WASM32: sub i64 {{.*}}, 1 + ; WASM32: sub i64 {{.*}}, ptrtoint (i8* getelementptr (i8, i8* null, i64 1) to i64) ; WASM32: icmp ult i64 {{.*}}, 2 %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1") ret i1 %x diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll new file mode 100644 index 000000000000..7ca70f2636fd --- /dev/null +++ b/test/Transforms/LowerTypeTests/import-unsat.ll @@ -0,0 +1,23 @@ +; Test that we correctly import an unsat resolution for type identifier "typeid1". +; RUN: opt -S -lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%S/Inputs/import-unsat.yaml -lowertypetests-write-summary=%t < %s | FileCheck %s +; RUN: FileCheck --check-prefix=SUMMARY %s < %t + +; SUMMARY: GlobalValueMap: +; SUMMARY-NEXT: 42: +; SUMMARY-NEXT: - TypeTests: +; SUMMARY-NEXT: - 123 +; SUMMARY-NEXT: TypeIdMap: +; SUMMARY-NEXT: typeid1: +; SUMMARY-NEXT: TTRes: +; SUMMARY-NEXT: Kind: Unsat +; SUMMARY-NEXT: SizeBitWidth: 0 + +target datalayout = "e-p:32:32" + +declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone + +define i1 @foo(i8* %p) { + %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1") + ; CHECK: ret i1 false + ret i1 %x +} diff --git a/test/Transforms/LowerTypeTests/simple.ll b/test/Transforms/LowerTypeTests/simple.ll index 0628951868cd..91b94184420b 100644 --- a/test/Transforms/LowerTypeTests/simple.ll +++ b/test/Transforms/LowerTypeTests/simple.ll @@ -92,7 +92,7 @@ define i1 @bar(i32* %p) { ; CHECK: [[S0:%[^ ]*]] = bitcast i32* [[B0]] to i8* %pi8 = bitcast i32* %p to i8* ; CHECK: [[S1:%[^ ]*]] = ptrtoint i8* [[S0]] to i32 - ; CHECK: [[S2:%[^ ]*]] = sub i32 [[S1]], add (i32 ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32), i32 4) + ; CHECK: [[S2:%[^ ]*]] = sub i32 [[S1]], ptrtoint (i8* getelementptr (i8, i8* bitcast ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i8*), i32 4) to i32) ; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 8 ; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 24 ; CHECK: [[S5:%[^ ]*]] = or i32 [[S3]], [[S4]] diff --git a/test/Transforms/LowerTypeTests/single-offset.ll b/test/Transforms/LowerTypeTests/single-offset.ll index 6dd37984df9c..8d2c0e831cda 100644 --- a/test/Transforms/LowerTypeTests/single-offset.ll +++ b/test/Transforms/LowerTypeTests/single-offset.ll @@ -24,7 +24,7 @@ define i1 @foo(i8* %p) { ; CHECK: @bar(i8* [[B0:%[^ ]*]]) define i1 @bar(i8* %p) { ; CHECK: [[S0:%[^ ]*]] = 
ptrtoint i8* [[B0]] to i32
- ; CHECK: [[S1:%[^ ]*]] = icmp eq i32 [[S0]], add (i32 ptrtoint ({ i32, [0 x i8], i32 }* [[G]] to i32), i32 4)
+ ; CHECK: [[S1:%[^ ]*]] = icmp eq i32 [[S0]], ptrtoint (i8* getelementptr (i8, i8* bitcast ({ i32, [0 x i8], i32 }* [[G]] to i8*), i32 4) to i32)
 %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid3")
 ; CHECK: ret i1 [[S1]]
 ret i1 %x
diff --git a/test/Transforms/LowerTypeTests/unsat.ll b/test/Transforms/LowerTypeTests/unsat.ll
index 5bafc9e8c40d..e797baf59bc1 100644
--- a/test/Transforms/LowerTypeTests/unsat.ll
+++ b/test/Transforms/LowerTypeTests/unsat.ll
@@ -1,5 +1,4 @@
-; FIXME: We should not require -O2 to simplify this to return false.
-; RUN: opt -S -lowertypetests -O2 < %s | FileCheck %s
+; RUN: opt -S -lowertypetests < %s | FileCheck %s
 target datalayout = "e-p:32:32"