Diffstat (limited to 'test/CodeGen')
-rw-r--r-- test/CodeGen/AArch64/arm64-misched-memdep-bug.ll | 22
-rw-r--r-- test/CodeGen/AArch64/branch-folder-merge-mmos.ll | 33
-rw-r--r-- test/CodeGen/AArch64/machine-combiner.ll | 258
-rw-r--r-- test/CodeGen/AMDGPU/ctlz.ll | 269
-rw-r--r-- test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 197
-rw-r--r-- test/CodeGen/AMDGPU/flat-scratch-reg.ll | 8
-rw-r--r-- test/CodeGen/AMDGPU/fmin_legacy.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/fsub.ll | 15
-rw-r--r-- test/CodeGen/AMDGPU/hsa-globals.ll | 16
-rw-r--r-- test/CodeGen/AMDGPU/hsa-note-no-func.ll | 6
-rw-r--r-- test/CodeGen/AMDGPU/hsa.ll | 4
-rw-r--r-- test/CodeGen/AMDGPU/inline-asm.ll | 11
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll | 3
-rw-r--r-- test/CodeGen/AMDGPU/llvm.round.f64.ll | 2
-rw-r--r-- test/CodeGen/AMDGPU/ret.ll | 245
-rw-r--r-- test/CodeGen/AMDGPU/si-scheduler.ll | 55
-rw-r--r-- test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 62
-rw-r--r-- test/CodeGen/AMDGPU/sint_to_fp.ll | 91
-rw-r--r-- test/CodeGen/AMDGPU/udiv.ll | 89
-rw-r--r-- test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 57
-rw-r--r-- test/CodeGen/AMDGPU/uint_to_fp.ll | 123
-rw-r--r-- test/CodeGen/ARM/bit-reverse-to-rbit.ll | 34
-rw-r--r-- test/CodeGen/ARM/cxx-tlscc.ll | 46
-rw-r--r-- test/CodeGen/ARM/darwin-tls.ll | 165
-rw-r--r-- test/CodeGen/ARM/fabs-to-bfc.ll | 14
-rw-r--r-- test/CodeGen/ARM/fp16-args.ll | 7
-rw-r--r-- test/CodeGen/ARM/fp16-v3.ll | 28
-rw-r--r-- test/CodeGen/ARM/inlineasm-imm-thumb.ll | 20
-rw-r--r-- test/CodeGen/ARM/inlineasm-imm-thumb2.ll | 31
-rw-r--r-- test/CodeGen/ARM/zero-cycle-zero.ll | 58
-rw-r--r-- test/CodeGen/Hexagon/bit-phi.ll | 58
-rw-r--r-- test/CodeGen/Hexagon/postinc-offset.ll | 3
-rw-r--r-- test/CodeGen/Hexagon/rdf-copy.ll | 54
-rw-r--r-- test/CodeGen/Hexagon/rdf-dead-loop.ll | 31
-rw-r--r-- test/CodeGen/Mips/llvm-ir/call.ll | 15
-rw-r--r-- test/CodeGen/Mips/madd-msub.ll | 14
-rw-r--r-- test/CodeGen/PowerPC/2016-01-07-BranchWeightCrash.ll | 35
-rw-r--r-- test/CodeGen/PowerPC/ppc64le-localentry-large.ll | 27
-rw-r--r-- test/CodeGen/PowerPC/ppc64le-localentry.ll | 20
-rw-r--r-- test/CodeGen/PowerPC/pr25802.ll | 52
-rw-r--r-- test/CodeGen/PowerPC/tls_get_addr_clobbers.ll | 54
-rw-r--r-- test/CodeGen/PowerPC/tls_get_addr_stackframe.ll | 32
-rw-r--r-- test/CodeGen/SPARC/2011-01-19-DelaySlot.ll | 2
-rw-r--r-- test/CodeGen/SPARC/analyze-branch.ll | 58
-rw-r--r-- test/CodeGen/WebAssembly/call.ll | 22
-rw-r--r-- test/CodeGen/WebAssembly/cfg-stackify.ll | 817
-rw-r--r-- test/CodeGen/WebAssembly/comparisons_f32.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/comparisons_f64.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/comparisons_i32.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/comparisons_i64.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/conv.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/copysign-casts.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/cpus.ll | 16
-rw-r--r-- test/CodeGen/WebAssembly/dead-vreg.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/f32.ll | 4
-rw-r--r-- test/CodeGen/WebAssembly/f64.ll | 4
-rw-r--r-- test/CodeGen/WebAssembly/fast-isel.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/frem.ll | 6
-rw-r--r-- test/CodeGen/WebAssembly/func.ll | 3
-rw-r--r-- test/CodeGen/WebAssembly/global.ll | 20
-rw-r--r-- test/CodeGen/WebAssembly/globl.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/i32.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/i64.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/ident.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/immediates.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/inline-asm.ll | 7
-rw-r--r-- test/CodeGen/WebAssembly/legalize.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/load-ext.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/load-store-i1.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/load.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/loop-idiom.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/memory-addr32.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/memory-addr64.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/offset-folding.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/offset.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/phi.ll | 4
-rw-r--r-- test/CodeGen/WebAssembly/reg-stackify.ll | 22
-rw-r--r-- test/CodeGen/WebAssembly/return-int32.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/return-void.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/returned.ll | 10
-rw-r--r-- test/CodeGen/WebAssembly/select.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/signext-zeroext.ll | 6
-rw-r--r-- test/CodeGen/WebAssembly/store-results.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/store-trunc.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/store.ll | 2
-rw-r--r-- test/CodeGen/WebAssembly/switch.ll | 86
-rw-r--r-- test/CodeGen/WebAssembly/unreachable.ll | 4
-rw-r--r-- test/CodeGen/WebAssembly/unused-argument.ll | 4
-rw-r--r-- test/CodeGen/WebAssembly/userstack.ll | 23
-rw-r--r-- test/CodeGen/WebAssembly/varargs.ll | 4
-rw-r--r-- test/CodeGen/WebAssembly/vtable.ll | 2
-rw-r--r-- test/CodeGen/WinEH/wineh-cloning.ll | 42
-rw-r--r-- test/CodeGen/WinEH/wineh-no-demotion.ll | 4
-rw-r--r-- test/CodeGen/WinEH/wineh-statenumbering.ll | 2
-rw-r--r-- test/CodeGen/X86/2008-11-03-F80VAARG.ll | 4
-rw-r--r-- test/CodeGen/X86/2012-01-12-extract-sv.ll | 4
-rw-r--r-- test/CodeGen/X86/avx-vbroadcast.ll | 431
-rw-r--r-- test/CodeGen/X86/avx2-vbroadcast.ll | 763
-rw-r--r-- test/CodeGen/X86/avx512-intrinsics.ll | 395
-rw-r--r-- test/CodeGen/X86/avx512bw-intrinsics.ll | 215
-rw-r--r-- test/CodeGen/X86/avx512bwvl-intrinsics.ll | 421
-rw-r--r-- test/CodeGen/X86/avx512vl-intrinsics.ll | 1033
-rw-r--r-- test/CodeGen/X86/catchpad-lifetime.ll | 91
-rw-r--r-- test/CodeGen/X86/cxx_tlscc64.ll | 63
-rw-r--r-- test/CodeGen/X86/dagcombine-cse.ll | 2
-rw-r--r-- test/CodeGen/X86/f16c-intrinsics.ll | 12
-rw-r--r-- test/CodeGen/X86/insertps-combine.ll | 33
-rw-r--r-- test/CodeGen/X86/lea-opt.ll | 38
-rw-r--r-- test/CodeGen/X86/pr13577.ll | 16
-rw-r--r-- test/CodeGen/X86/scalar-int-to-fp.ll | 43
-rw-r--r-- test/CodeGen/X86/shrinkwrap-hang.ll | 32
-rw-r--r-- test/CodeGen/X86/stack-folding-fp-sse42.ll | 2
-rw-r--r-- test/CodeGen/X86/statepoint-vector.ll | 162
-rw-r--r-- test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 24
-rw-r--r-- test/CodeGen/X86/version_directive.ll | 4
-rw-r--r-- test/CodeGen/X86/x86-shrink-wrapping.ll | 105
116 files changed, 6353 insertions, 1136 deletions
diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
new file mode 100644
index 0000000000000..770521b752805
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
@@ -0,0 +1,22 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+;
+; Test for a bug in the misched memory dependency calculation.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: misched_bug:BB#0 entry
+; CHECK: SU(2): %vreg2<def> = LDRWui %vreg0, 1; mem:LD4[%ptr1_plus1] GPR32:%vreg2 GPR64common:%vreg0
+; CHECK: Successors:
+; CHECK-NEXT: val SU(5): Latency=4 Reg=%vreg2
+; CHECK-NEXT: ch SU(4): Latency=0
+; CHECK: SU(4): STRWui %WZR, %vreg1, 0; mem:ST4[%ptr2] GPR64common:%vreg1
+; CHECK: SU(5): %W0<def> = COPY %vreg2; GPR32:%vreg2
+; CHECK: ** ScheduleDAGMI::schedule picking next node
+define i32 @misched_bug(i32* %ptr1, i32* %ptr2) {
+entry:
+ %ptr1_plus1 = getelementptr inbounds i32, i32* %ptr1, i64 1
+ %val1 = load i32, i32* %ptr1_plus1, align 4
+ store i32 0, i32* %ptr1, align 4
+ store i32 0, i32* %ptr2, align 4
+ ret i32 %val1
+}
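
For context, the dependency structure the CHECK lines pin down: the load from %ptr1_plus1 (SU(2)) keeps a chain edge to the store through %ptr2 (SU(4)), which may alias it, but gets no edge to the store through %ptr1, which is provably 4 bytes away. A minimal sketch of the two cases, with illustrative function names not taken from the test:

; May alias: %p and %q are unrelated incoming pointers, so the
; scheduler must keep a chain edge from the load to the store.
define i32 @may_alias(i32* %p, i32* %q) {
  %v = load i32, i32* %p, align 4
  store i32 0, i32* %q, align 4
  ret i32 %v
}

; No alias: disjoint offsets from the same base need no chain edge,
; so the scheduler is free to reorder the load and the store.
define i32 @no_alias(i32* %p) {
  %p1 = getelementptr inbounds i32, i32* %p, i64 1
  %v = load i32, i32* %p1, align 4
  store i32 0, i32* %p, align 4
  ret i32 %v
}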
diff --git a/test/CodeGen/AArch64/branch-folder-merge-mmos.ll b/test/CodeGen/AArch64/branch-folder-merge-mmos.ll
new file mode 100644
index 0000000000000..3f9c0239fe41b
--- /dev/null
+++ b/test/CodeGen/AArch64/branch-folder-merge-mmos.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=aarch64 -mtriple=aarch64-none-linux-gnu -stop-after branch-folder -o /dev/null < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Function Attrs: norecurse nounwind
+define void @foo(i32 %a, i32 %b, float* nocapture %foo_arr) #0 {
+; CHECK: (load 4 from %ir.arrayidx1.{{i[1-2]}}), (load 4 from %ir.arrayidx1.{{i[1-2]}})
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %0 = load float, float* %foo_arr, align 4
+ %arrayidx1.i1 = getelementptr inbounds float, float* %foo_arr, i64 1
+ %1 = load float, float* %arrayidx1.i1, align 4
+ %sub.i = fsub float %0, %1
+ store float %sub.i, float* %foo_arr, align 4
+ br label %if.end3
+
+if.end: ; preds = %entry
+ %cmp1 = icmp sgt i32 %b, 0
+ br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2: ; preds = %if.end
+ %2 = load float, float* %foo_arr, align 4
+ %arrayidx1.i2 = getelementptr inbounds float, float* %foo_arr, i64 1
+ %3 = load float, float* %arrayidx1.i2, align 4
+ %sub.i3 = fsub float %2, %3
+ store float %sub.i3, float* %foo_arr, align 4
+ br label %if.end3
+
+if.end3: ; preds = %if.then2, %if.end, %if.then
+ ret void
+}
diff --git a/test/CodeGen/AArch64/machine-combiner.ll b/test/CodeGen/AArch64/machine-combiner.ll
new file mode 100644
index 0000000000000..56a742fd6c3a9
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-combiner.ll
@@ -0,0 +1,258 @@
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math < %s | FileCheck %s
+
+; Verify that the first two adds are independent regardless of how the inputs are
+; commuted. The destination registers are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %t1, %x3
+ ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %t1, %x3
+ ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
+; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
+; produced because that would cost more compile time.
+
+define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds5:
+; CHECK: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s4, s5
+; CHECK-NEXT: fadd s1, s1, s6
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s0, s0, s7
+; CHECK-NEXT: ret
+ %t0 = fadd float %x0, %x1
+ %t1 = fadd float %t0, %x2
+ %t2 = fadd float %t1, %x3
+ %t3 = fadd float %t2, %x4
+ %t4 = fadd float %t3, %x5
+ %t5 = fadd float %t4, %x6
+ %t6 = fadd float %t5, %x7
+ ret float %t6
+}
+
+; Verify that we only need two associative operations to reassociate the operands.
+; Also, we should reassociate such that the result of the high latency division
+; is used by the final 'add' rather than reassociating the %x3 operand with the
+; division. The latter reassociation would not improve anything.
+
+define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds6:
+; CHECK: fdiv s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %t0 = fdiv float %x0, %x1
+ %t1 = fadd float %x2, %t0
+ %t2 = fadd float %x3, %t1
+ ret float %t2
+}
+
+; Verify that scalar single-precision multiplies are reassociated.
+
+define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_muls1:
+; CHECK: fdiv s0, s0, s1
+; CHECK-NEXT: fmul s1, s2, s3
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: ret
+ %t0 = fdiv float %x0, %x1
+ %t1 = fmul float %x2, %t0
+ %t2 = fmul float %x3, %t1
+ ret float %t2
+}
+
+; Verify that scalar double-precision adds are reassociated.
+
+define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) {
+; CHECK-LABEL: reassociate_adds_double:
+; CHECK: fdiv d0, d0, d1
+; CHECK-NEXT: fadd d1, d2, d3
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
+ %t0 = fdiv double %x0, %x1
+ %t1 = fadd double %x2, %t0
+ %t2 = fadd double %x3, %t1
+ ret double %t2
+}
+
+; Verify that scalar double-precision multiplies are reassociated.
+
+define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) {
+; CHECK-LABEL: reassociate_muls_double:
+; CHECK: fdiv d0, d0, d1
+; CHECK-NEXT: fmul d1, d2, d3
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: ret
+ %t0 = fdiv double %x0, %x1
+ %t1 = fmul double %x2, %t0
+ %t2 = fmul double %x3, %t1
+ ret double %t2
+}
+
+; Verify that we reassociate vector instructions too.
+
+define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds1:
+; CHECK: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %t0 = fadd <4 x float> %x0, %x1
+ %t1 = fadd <4 x float> %t0, %x2
+ %t2 = fadd <4 x float> %t1, %x3
+ ret <4 x float> %t2
+}
+
+define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds2:
+; CHECK: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+ %t0 = fadd <4 x float> %x0, %x1
+ %t1 = fadd <4 x float> %x2, %t0
+ %t2 = fadd <4 x float> %t1, %x3
+ ret <4 x float> %t2
+}
+
+define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds3:
+; CHECK: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+ %t0 = fadd <4 x float> %x0, %x1
+ %t1 = fadd <4 x float> %t0, %x2
+ %t2 = fadd <4 x float> %x3, %t1
+ ret <4 x float> %t2
+}
+
+define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds4:
+; CHECK: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+ %t0 = fadd <4 x float> %x0, %x1
+ %t1 = fadd <4 x float> %x2, %t0
+ %t2 = fadd <4 x float> %x3, %t1
+ ret <4 x float> %t2
+}
+; Verify that 128-bit vector single-precision multiplies are reassociated.
+
+define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: reassociate_muls_v4f32:
+; CHECK: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmul v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %t0 = fadd <4 x float> %x0, %x1
+ %t1 = fmul <4 x float> %x2, %t0
+ %t2 = fmul <4 x float> %x3, %t1
+ ret <4 x float> %t2
+}
+
+; Verify that 128-bit vector double-precision multiplies are reassociated.
+
+define <2 x double> @reassociate_muls_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
+; CHECK-LABEL: reassociate_muls_v2f64:
+; CHECK: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fmul v1.2d, v2.2d, v3.2d
+; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ret
+ %t0 = fadd <2 x double> %x0, %x1
+ %t1 = fmul <2 x double> %x2, %t0
+ %t2 = fmul <2 x double> %x3, %t1
+ ret <2 x double> %t2
+}
+
+; PR25016: https://llvm.org/bugs/show_bug.cgi?id=25016
+; Verify that reassociation is not happening needlessly or wrongly.
+
+declare double @bar()
+
+define double @reassociate_adds_from_calls() {
+; CHECK-LABEL: reassociate_adds_from_calls:
+; CHECK: bl bar
+; CHECK-NEXT: mov v8.16b, v0.16b
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: mov v9.16b, v0.16b
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: mov v10.16b, v0.16b
+; CHECK-NEXT: bl bar
+; CHECK: fadd d1, d8, d9
+; CHECK-NEXT: fadd d0, d10, d0
+; CHECK-NEXT: fadd d0, d1, d0
+ %x0 = call double @bar()
+ %x1 = call double @bar()
+ %x2 = call double @bar()
+ %x3 = call double @bar()
+ %t0 = fadd double %x0, %x1
+ %t1 = fadd double %t0, %x2
+ %t2 = fadd double %t1, %x3
+ ret double %t2
+}
+
+define double @already_reassociated() {
+; CHECK-LABEL: already_reassociated:
+; CHECK: bl bar
+; CHECK-NEXT: mov v8.16b, v0.16b
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: mov v9.16b, v0.16b
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: mov v10.16b, v0.16b
+; CHECK-NEXT: bl bar
+; CHECK: fadd d1, d8, d9
+; CHECK-NEXT: fadd d0, d10, d0
+; CHECK-NEXT: fadd d0, d1, d0
+ %x0 = call double @bar()
+ %x1 = call double @bar()
+ %x2 = call double @bar()
+ %x3 = call double @bar()
+ %t0 = fadd double %x0, %x1
+ %t1 = fadd double %x2, %x3
+ %t2 = fadd double %t0, %t1
+ ret double %t2
+}
+
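For reference, the rewrite these tests exercise: with -enable-unsafe-fp-math, the machine combiner rebalances a serial chain of dependent fadds so that independent pairs can issue in parallel. A sketch of the before/after shapes in IR (illustrative function names):

; Before: every fadd waits on the previous one; three adds on the
; critical path.
define float @chain(float %x0, float %x1, float %x2, float %x3) {
  %t0 = fadd float %x0, %x1
  %t1 = fadd float %t0, %x2
  %t2 = fadd float %t1, %x3
  ret float %t2
}

; After: %a and %b are independent and can issue in parallel; two adds
; on the critical path.
define float @rebalanced(float %x0, float %x1, float %x2, float %x3) {
  %a = fadd float %x0, %x1
  %b = fadd float %x2, %x3
  %r = fadd float %a, %b
  ret float %r
}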
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
new file mode 100644
index 0000000000000..baedf47eef0d0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -0,0 +1,269 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
+declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
+
+declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
+declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}s_ctlz_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: s_flbit_i32_b32 [[CTLZ:s[0-9]+]], [[VAL]]
+; SI-DAG: v_cmp_eq_i32_e64 [[CMPZ:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; SI-DAG: v_mov_b32_e32 [[VCTLZ:v[0-9]+]], [[CTLZ]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[VCTLZ]], 32, [[CMPZ]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+
+; EG: FFBH_UINT
+; EG: CNDE_INT
+define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
+; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+
+; EG: FFBH_UINT
+; EG: CNDE_INT
+define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_v2i32:
+; SI: buffer_load_dwordx2
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+
+; EG: FFBH_UINT
+; EG: CNDE_INT
+; EG: FFBH_UINT
+; EG: CNDE_INT
+define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
+ store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+
+
+; EG-DAG: FFBH_UINT
+; EG-DAG: CNDE_INT
+
+; EG-DAG: FFBH_UINT
+; EG-DAG: CNDE_INT
+
+; EG-DAG: FFBH_UINT
+; EG-DAG: CNDE_INT
+
+; EG-DAG: FFBH_UINT
+; EG-DAG: CNDE_INT
+define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
+ store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i8:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]]
+; SI-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]]
+; SI: buffer_store_byte [[RESULT]],
+define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i8, i8 addrspace(1)* %valptr
+ %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
+ store i8 %ctlz, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctlz_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
+; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
+; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
+; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
+; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
+ store i64 %ctlz, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:
+define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
+ %trunc = trunc i64 %ctlz to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i64:
+; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
+; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
+; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
+; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
+; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
+; SI-DAG: v_cndmask_b32_e64 v[[CTLZ_LO:[0-9]+]], v[[CTLZ]], 64, vcc
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ_LO]]:[[CTLZ_HI]]{{\]}}
+define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
+ store i64 %ctlz, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:
+define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
+ %trunc = trunc i64 %ctlz to i32
+ store i32 %trunc, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+ define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i32 %val, 0
+ %sel = select i1 %cmp, i32 -1, i32 %ctlz
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ %cmp = icmp ne i32 %val, 0
+ %sel = select i1 %cmp, i32 %ctlz, i32 -1
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; TODO: Should be able to eliminate select here as well.
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: s_endpgm
+define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i32 %ctlz, 32
+ %sel = select i1 %cmp, i32 -1, i32 %ctlz
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: s_endpgm
+define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+ %cmp = icmp ne i32 %ctlz, 32
+ %sel = select i1 %cmp, i32 %ctlz, i32 -1
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i8_sel_eq_neg1:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: buffer_store_byte [[FFBH]],
+ define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i8, i8 addrspace(1)* %valptr
+ %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i8 %val, 0
+ %sel = select i1 %cmp, i8 -1, i8 %ctlz
+ store i8 %sel, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: buffer_store_short [[FFBH]],
+ define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i16, i16 addrspace(1)* %valptr
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i16 %val, 0
+ %sel = select i1 %cmp, i16 -1, i16 %ctlz
+ store i16 %sel, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_i7_sel_eq_neg1:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
+; SI: buffer_store_byte [[TRUNC]],
+ define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i7, i7 addrspace(1)* %valptr
+ %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i7 %val, 0
+ %sel = select i1 %cmp, i7 -1, i7 %ctlz
+ store i7 %sel, i7 addrspace(1)* %out
+ ret void
+}
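The selects in the checks above come from the intrinsic's semantics: with is_zero_undef = i1 false, llvm.ctlz is fully defined and returns the bit width for a zero input, whereas the GCN ffbh/flbit instructions return -1 when no bits are set, so the backend must patch up the zero case. A small worked example (illustrative function name):

declare i32 @llvm.ctlz.i32(i32, i1)

define i32 @ctlz_defined_examples() {
  ; ctlz(0) must be the bit width, 32; the hardware ffbh would give -1.
  %a = call i32 @llvm.ctlz.i32(i32 0, i1 false)   ; 32
  ; ctlz(1) counts the 31 leading zero bits.
  %b = call i32 @llvm.ctlz.i32(i32 1, i1 false)   ; 31
  %r = add i32 %a, %b
  ret i32 %r
}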
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index bd26c302fe5a7..c1f84cd460cfd 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -2,10 +2,18 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+
declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
+declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32:
; SI: s_load_dword [[VAL:s[0-9]+]],
; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
@@ -69,3 +77,192 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
ret void
}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[FFBH]]
+; SI: buffer_store_byte [[RESULT]],
+define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i8, i8 addrspace(1)* %valptr
+ %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
+ store i8 %ctlz, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
+; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
+; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
+; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
+; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+ store i64 %ctlz, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
+define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+ %trunc = trunc i64 %ctlz to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
+; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
+; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
+; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
+; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+ store i64 %ctlz, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
+define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
+ %trunc = trunc i64 %ctlz to i32
+ store i32 %trunc, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI-NEXT: buffer_store_dword [[RESULT]],
+ define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %cmp = icmp eq i32 %val, 0
+ %sel = select i1 %cmp, i32 -1, i32 %ctlz
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI-NEXT: buffer_store_dword [[RESULT]],
+define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %cmp = icmp ne i32 %val, 0
+ %sel = select i1 %cmp, i32 %ctlz, i32 -1
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8_sel_eq_neg1:
+; SI: buffer_load_ubyte [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; SI: buffer_store_byte [[FFBH]],
+ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i8, i8 addrspace(1)* %valptr
+ %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
+ %cmp = icmp eq i8 %val, 0
+ %sel = select i1 %cmp, i8 -1, i8 %ctlz
+ store i8 %sel, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
+; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[VAL]]
+; SI-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
+; SI-DAG: buffer_store_dword [[RESULT0]]
+; SI-DAG: buffer_store_byte [[RESULT1]]
+; SI: s_endpgm
+ define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %cmp = icmp eq i32 %val, 0
+ %sel = select i1 %cmp, i32 -1, i32 %ctlz
+ store volatile i32 %sel, i32 addrspace(1)* %out
+ store volatile i1 %cmp, i1 addrspace(1)* undef
+ ret void
+}
+
+; Selected on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+ define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %cmp = icmp eq i32 %val, 0
+ %sel = select i1 %cmp, i32 0, i32 %ctlz
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; Selected on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %cmp = icmp ne i32 %val, 0
+ %sel = select i1 %cmp, i32 %ctlz, i32 0
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; Compare on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+ define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %cmp = icmp eq i32 %val, 1
+ %sel = select i1 %cmp, i32 0, i32 %ctlz
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; Selected on wrong constant
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
+; SI: buffer_load_dword
+; SI: v_ffbh_u32_e32
+; SI: v_cmp
+; SI: v_cndmask
+; SI: buffer_store_dword
+define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %cmp = icmp ne i32 %val, 1
+ %sel = select i1 %cmp, i32 %ctlz, i32 0
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
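Taken together, the sel tests pin down when the select folds away: with is_zero_undef = i1 true, the compare-and-select is redundant exactly when it compares the source against zero and selects -1, since that reproduces what ffbh already returns for a zero input; selecting 0 instead, or comparing against a nonzero constant, leaves the v_cmp/v_cndmask pair in place. In IR terms, a sketch rather than test content:

declare i32 @llvm.ctlz.i32(i32, i1)

define i32 @fold_contrast(i32 %val) {
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true)
  %cmp = icmp eq i32 %val, 0
  ; Folds to a bare v_ffbh_u32: the hardware already yields -1 for zero.
  %folds = select i1 %cmp, i32 -1, i32 %ctlz
  ; Kept: 0 is not what ffbh produces for zero, so v_cmp/v_cndmask stay.
  %kept = select i1 %cmp, i32 0, i32 %ctlz
  %r = xor i32 %folds, %kept
  ret i32 %r
}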
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 9aea7c7734314..b9489101f906b 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=CI --check-prefix=NO-XNACK
; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI --check-prefix=NO-XNACK
-; RUN: llc < %s -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=XNACK
+; RUN: llc < %s -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI --check-prefix=XNACK
; GCN-LABEL: {{^}}no_vcc_no_flat:
; NO-XNACK: ; NumSgprs: 8
@@ -22,8 +22,7 @@ entry:
; GCN-LABEL: {{^}}no_vcc_flat:
; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 12
-; XNACK: ; NumSgprs: 14
+; VI: ; NumSgprs: 14
define void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
@@ -32,8 +31,7 @@ entry:
; GCN-LABEL: {{^}}vcc_flat:
; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 12
-; XNACK: ; NumSgprs: 14
+; VI: ; NumSgprs: 14
define void @vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 52fc3d0d251a4..69a0a520a476c 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -8,8 +8,8 @@ declare i32 @llvm.r600.read.tidig.x() #1
; FUNC-LABEL: @test_fmin_legacy_f32
; EG: MIN *
-; SI-SAFE: v_min_legacy_f32_e32
-; SI-NONAN: v_min_f32_e32
+; SI-SAFE: v_min_legacy_f32_e64
+; SI-NONAN: v_min_f32_e64
define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
index dfe41cb5b1111..38d573258a5e2 100644
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -32,9 +32,8 @@ declare void @llvm.AMDGPU.store.output(float, i32)
; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
-; FIXME: Should be using SGPR directly for first operand
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
%sub = fsub <2 x float> %a, %b
store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
@@ -60,13 +59,11 @@ define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(
ret void
}
-; FIXME: Should be using SGPR directly for first operand
-
; FUNC-LABEL: {{^}}s_fsub_v4f32:
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: s_endpgm
define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
%result = fsub <4 x float> %a, %b
diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll
index 1d76c40c042e8..90322ac3dc018 100644
--- a/test/CodeGen/AMDGPU/hsa-globals.ll
+++ b/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -17,41 +17,49 @@ define void @test() {
}
; ASM: .amdgpu_hsa_module_global internal_global
+; ASM: .size internal_global_program, 4
; ASM: .hsadata_global_program
; ASM: internal_global_program:
; ASM: .long 0
; ASM: .amdgpu_hsa_module_global common_global
+; ASM: .size common_global_program, 4
; ASM: .hsadata_global_program
; ASM: common_global_program:
; ASM: .long 0
; ASM: .amdgpu_hsa_program_global external_global
+; ASM: .size external_global_program, 4
; ASM: .hsadata_global_program
; ASM: external_global_program:
; ASM: .long 0
; ASM: .amdgpu_hsa_module_global internal_global
+; ASM: .size internal_global_agent, 4
; ASM: .hsadata_global_agent
; ASM: internal_global_agent:
; ASM: .long 0
; ASM: .amdgpu_hsa_module_global common_global
+; ASM: .size common_global_agent, 4
; ASM: .hsadata_global_agent
; ASM: common_global_agent:
; ASM: .long 0
; ASM: .amdgpu_hsa_program_global external_global
+; ASM: .size external_global_agent, 4
; ASM: .hsadata_global_agent
; ASM: external_global_agent:
; ASM: .long 0
; ASM: .amdgpu_hsa_module_global internal_readonly
+; ASM: .size internal_readonly, 4
; ASM: .hsatext
; ASM: internal_readonly:
; ASM: .long 0
; ASM: .amdgpu_hsa_program_global external_readonly
+; ASM: .size external_readonly, 4
; ASM: .hsatext
; ASM: external_readonly:
; ASM: .long 0
@@ -79,18 +87,21 @@ define void @test() {
; ELF: Symbol {
; ELF: Name: common_global_agent
+; ELF: Size: 4
; ELF: Binding: Local
; ELF: Section: .hsadata_global_agent
; ELF: }
; ELF: Symbol {
; ELF: Name: common_global_program
+; ELF: Size: 4
; ELF: Binding: Local
; ELF: Section: .hsadata_global_program
; ELF: }
; ELF: Symbol {
; ELF: Name: internal_global_agent
+; ELF: Size: 4
; ELF: Binding: Local
; ELF: Type: Object
; ELF: Section: .hsadata_global_agent
@@ -98,6 +109,7 @@ define void @test() {
; ELF: Symbol {
; ELF: Name: internal_global_program
+; ELF: Size: 4
; ELF: Binding: Local
; ELF: Type: Object
; ELF: Section: .hsadata_global_program
@@ -105,6 +117,7 @@ define void @test() {
; ELF: Symbol {
; ELF: Name: internal_readonly
+; ELF: Size: 4
; ELF: Binding: Local
; ELF: Type: Object
; ELF: Section: .hsatext
@@ -112,6 +125,7 @@ define void @test() {
; ELF: Symbol {
; ELF: Name: external_global_agent
+; ELF: Size: 4
; ELF: Binding: Global
; ELF: Type: Object
; ELF: Section: .hsadata_global_agent
@@ -119,6 +133,7 @@ define void @test() {
; ELF: Symbol {
; ELF: Name: external_global_program
+; ELF: Size: 4
; ELF: Binding: Global
; ELF: Type: Object
; ELF: Section: .hsadata_global_program
@@ -126,6 +141,7 @@ define void @test() {
; ELF: Symbol {
; ELF: Name: external_readonly
+; ELF: Size: 4
; ELF: Binding: Global
; ELF: Type: Object
; ELF: Section: .hsatext
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
new file mode 100644
index 0000000000000..0e4662231b4fb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s
+
+; HSA: .hsa_code_object_version 1,0
+; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
+; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index abc89b7fd837c..c089dfd9a9718 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -28,6 +28,7 @@
; ELF: Symbol {
; ELF: Name: simple
+; ELF: Size: 296
; ELF: Type: AMDGPU_HSA_KERNEL (0xA)
; ELF: }
@@ -52,6 +53,9 @@
; Make sure we generate flat store for HSA
; HSA: flat_store_dword v{{[0-9]+}}
+; HSA: .Lfunc_end0:
+; HSA: .size simple, .Lfunc_end0-simple
+
define void @simple(i32 addrspace(1)* %out) {
entry:
store i32 0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index efc2292de3a52..9c8d3534f8adb 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -10,3 +10,14 @@ entry:
call void asm sideeffect "s_endpgm", ""()
ret void
}
+
+; CHECK: {{^}}inline_asm_shader:
+; CHECK: s_endpgm
+; CHECK: s_endpgm
+define void @inline_asm_shader() #0 {
+entry:
+ call void asm sideeffect "s_endpgm", ""()
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index dc95cd1ee012f..d96ea743f6edb 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -1,4 +1,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: unsupported hsa intrinsic without hsa target in test
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_dispatch_ptr = 1
diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 6b365dc09e2a9..98afbeee93e6c 100644
--- a/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -21,7 +21,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI-DAG: v_cmp_eq_i32
; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
-; SI-DAG: v_cmp_gt_i32_e32
+; SI-DAG: v_cmp_gt_i32
; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
; SI: buffer_store_dwordx2
diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll
new file mode 100644
index 0000000000000..2bd9fd6858fec
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ret.ll
@@ -0,0 +1,245 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+attributes #0 = { "ShaderType"="1" }
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+; GCN-LABEL: {{^}}vgpr:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+ %x = fadd float %3, 1.0
+ %a = insertvalue {float, float} undef, float %x, 0
+ %b = insertvalue {float, float} %a, float %3, 1
+ ret {float, float} %b
+}
+
+; GCN-LABEL: {{^}}vgpr_literal:
+; GCN: v_mov_b32_e32 v4, v0
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-DAG: v_mov_b32_e32 v3, -1.0
+; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+ ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
+}
+
+
+; GCN: .long 165580
+; GCN-NEXT: .long 562
+; GCN-NEXT: .long 165584
+; GCN-NEXT: .long 562
+; GCN-LABEL: {{^}}vgpr_ps_addr0:
+; GCN-NOT: v_mov_b32_e32 v0
+; GCN-NOT: v_mov_b32_e32 v1
+; GCN-NOT: v_mov_b32_e32 v2
+; GCN: v_mov_b32_e32 v3, v4
+; GCN: v_mov_b32_e32 v4, v6
+; GCN-NOT: s_endpgm
+attributes #1 = { "ShaderType"="0" "InitialPSInputAddr"="0" }
+define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+ %i0 = extractelement <2 x i32> %4, i32 0
+ %i1 = extractelement <2 x i32> %4, i32 1
+ %i2 = extractelement <2 x i32> %7, i32 0
+ %i3 = extractelement <2 x i32> %8, i32 0
+ %f0 = bitcast i32 %i0 to float
+ %f1 = bitcast i32 %i1 to float
+ %f2 = bitcast i32 %i2 to float
+ %f3 = bitcast i32 %i3 to float
+ %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+ %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+ %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+ %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+ %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+ ret {float, float, float, float, float} %r4
+}
+
+
+; GCN: .long 165580
+; GCN-NEXT: .long 1
+; GCN-NEXT: .long 165584
+; GCN-NEXT: .long 1
+; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
+; GCN: v_mov_b32_e32 v0, 1.0
+; GCN-NOT: s_endpgm
+define float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+ ret float 1.0
+}
+
+
+; GCN: .long 165580
+; GCN-NEXT: .long 2081
+; GCN-NEXT: .long 165584
+; GCN-NEXT: .long 2081
+; GCN-LABEL: {{^}}ps_input_ena_pos_w:
+; GCN-DAG: v_mov_b32_e32 v0, v4
+; GCN-DAG: v_mov_b32_e32 v1, v2
+; GCN: v_mov_b32_e32 v2, v3
+; GCN-NOT: s_endpgm
+define {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+ %f = bitcast <2 x i32> %8 to <2 x float>
+ %s = insertvalue {float, <2 x float>} undef, float %14, 0
+ %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1
+ ret {float, <2 x float>} %s1
+}
+
+
+; GCN: .long 165580
+; GCN-NEXT: .long 562
+; GCN-NEXT: .long 165584
+; GCN-NEXT: .long 563
+; GCN-LABEL: {{^}}vgpr_ps_addr1:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mov_b32_e32 v2, v4
+; GCN-DAG: v_mov_b32_e32 v3, v6
+; GCN-DAG: v_mov_b32_e32 v4, v8
+; GCN-NOT: s_endpgm
+attributes #2 = { "ShaderType"="0" "InitialPSInputAddr"="1" }
+define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
+ %i0 = extractelement <2 x i32> %4, i32 0
+ %i1 = extractelement <2 x i32> %4, i32 1
+ %i2 = extractelement <2 x i32> %7, i32 0
+ %i3 = extractelement <2 x i32> %8, i32 0
+ %f0 = bitcast i32 %i0 to float
+ %f1 = bitcast i32 %i1 to float
+ %f2 = bitcast i32 %i2 to float
+ %f3 = bitcast i32 %i3 to float
+ %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+ %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+ %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+ %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+ %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+ ret {float, float, float, float, float} %r4
+}
+
+
+; GCN: .long 165580
+; GCN-NEXT: .long 562
+; GCN-NEXT: .long 165584
+; GCN-NEXT: .long 631
+; GCN-LABEL: {{^}}vgpr_ps_addr119:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mov_b32_e32 v2, v6
+; GCN: v_mov_b32_e32 v3, v8
+; GCN: v_mov_b32_e32 v4, v12
+; GCN-NOT: s_endpgm
+attributes #3 = { "ShaderType"="0" "InitialPSInputAddr"="119" }
+define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
+ %i0 = extractelement <2 x i32> %4, i32 0
+ %i1 = extractelement <2 x i32> %4, i32 1
+ %i2 = extractelement <2 x i32> %7, i32 0
+ %i3 = extractelement <2 x i32> %8, i32 0
+ %f0 = bitcast i32 %i0 to float
+ %f1 = bitcast i32 %i1 to float
+ %f2 = bitcast i32 %i2 to float
+ %f3 = bitcast i32 %i3 to float
+ %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+ %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+ %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+ %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+ %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+ ret {float, float, float, float, float} %r4
+}
+
+
+; GCN: .long 165580
+; GCN-NEXT: .long 562
+; GCN-NEXT: .long 165584
+; GCN-NEXT: .long 946
+; GCN-LABEL: {{^}}vgpr_ps_addr418:
+; GCN-NOT: v_mov_b32_e32 v0
+; GCN-NOT: v_mov_b32_e32 v1
+; GCN-NOT: v_mov_b32_e32 v2
+; GCN: v_mov_b32_e32 v3, v4
+; GCN: v_mov_b32_e32 v4, v8
+; GCN-NOT: s_endpgm
+attributes #4 = { "ShaderType"="0" "InitialPSInputAddr"="418" }
+define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #4 {
+ %i0 = extractelement <2 x i32> %4, i32 0
+ %i1 = extractelement <2 x i32> %4, i32 1
+ %i2 = extractelement <2 x i32> %7, i32 0
+ %i3 = extractelement <2 x i32> %8, i32 0
+ %f0 = bitcast i32 %i0 to float
+ %f1 = bitcast i32 %i1 to float
+ %f2 = bitcast i32 %i2 to float
+ %f3 = bitcast i32 %i3 to float
+ %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+ %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+ %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+ %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+ %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+ ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}sgpr:
+; GCN: s_add_i32 s0, s3, 2
+; GCN: s_mov_b32 s2, s3
+; GCN-NOT: s_endpgm
+define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+ %x = add i32 %2, 2
+ %a = insertvalue {i32, i32, i32} undef, i32 %x, 0
+ %b = insertvalue {i32, i32, i32} %a, i32 %1, 1
+ %c = insertvalue {i32, i32, i32} %a, i32 %2, 2
+ ret {i32, i32, i32} %c
+}
+
+
+; GCN-LABEL: {{^}}sgpr_literal:
+; GCN: s_mov_b32 s0, 5
+; GCN-NOT: s_mov_b32 s0, s0
+; GCN-DAG: s_mov_b32 s1, 6
+; GCN-DAG: s_mov_b32 s2, 7
+; GCN-DAG: s_mov_b32 s3, 8
+; GCN-NOT: s_endpgm
+define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+ %x = add i32 %2, 2
+ ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
+}
+
+
+; GCN-LABEL: {{^}}both:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: s_add_i32 s0, s3, 2
+; GCN-DAG: s_mov_b32 s1, s2
+; GCN: s_mov_b32 s2, s3
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+ %v = fadd float %3, 1.0
+ %s = add i32 %2, 2
+ %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0
+ %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1
+ %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2
+ %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3
+ %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4
+ ret {float, i32, float, i32, i32} %a4
+}
+
+
+; GCN-LABEL: {{^}}structure_literal:
+; GCN: v_mov_b32_e32 v3, v0
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: s_mov_b32 s0, 2
+; GCN-DAG: s_mov_b32 s1, 3
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-DAG: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
+define {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+ ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
+}
diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll
new file mode 100644
index 0000000000000..66a9571d75bf0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-scheduler.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=SI --misched=si < %s | FileCheck %s
+
+; The test checks that the "si" machine scheduler pass works correctly.
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_wqm
+; CHECK: s_load_dwordx4
+; CHECK: s_load_dwordx8
+; CHECK: s_waitcnt lgkmcnt(0)
+; CHECK: image_sample
+; CHECK: s_waitcnt vmcnt(0)
+; CHECK: exp
+; CHECK: s_endpgm
+
+define void @main([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>,
+<2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+main_body:
+ %22 = bitcast [34 x <8 x i32>] addrspace(2)* %3 to <32 x i8> addrspace(2)*
+ %23 = load <32 x i8>, <32 x i8> addrspace(2)* %22, align 32, !tbaa !0
+ %24 = bitcast [17 x <4 x i32>] addrspace(2)* %2 to <16 x i8> addrspace(2)*
+ %25 = load <16 x i8>, <16 x i8> addrspace(2)* %24, align 16, !tbaa !0
+ %26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %11)
+ %27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %11)
+ %28 = bitcast float %26 to i32
+ %29 = bitcast float %27 to i32
+ %30 = insertelement <2 x i32> undef, i32 %28, i32 0
+ %31 = insertelement <2 x i32> %30, i32 %29, i32 1
+ %32 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %31, <32 x i8> %23, <16 x i8> %25, i32 2)
+ %33 = extractelement <4 x float> %32, i32 0
+ %34 = extractelement <4 x float> %32, i32 1
+ %35 = extractelement <4 x float> %32, i32 2
+ %36 = extractelement <4 x float> %32, i32 3
+ %37 = call i32 @llvm.SI.packf16(float %33, float %34)
+ %38 = bitcast i32 %37 to float
+ %39 = call i32 @llvm.SI.packf16(float %35, float %36)
+ %40 = bitcast i32 %39 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %38, float %40, float %38, float %40)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
new file mode 100644
index 0000000000000..138b93b16d8d1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -0,0 +1,62 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+
+; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
+
+; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32:
+define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+ %result = sitofp i64 %in to float
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sint_to_fp_i64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2
+
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 63
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\]}}, 63, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: v_xor_b32
+
+; GCN: v_ffbh_u32
+; GCN: v_ffbh_u32
+; GCN: v_cndmask
+; GCN: v_cndmask
+
+; GCN-DAG: v_cmp_eq_i64
+; GCN-DAG: v_cmp_lt_u64
+
+; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 [[SIGN_SEL:v[0-9]+]],
+; GCN: {{buffer|flat}}_store_dword [[SIGN_SEL]]
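+;
+; A sketch of the lowering those checks trace: compute |x| with the ashr/xor
+; pair, find the leading set bit with ffbh to normalize the 64-bit value into
+; a 32-bit window, convert that, and finally fold the saved sign back in via
+; the xor with 0x80000000 feeding the last select.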
+define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep
+ %result = sitofp i64 %val to float
+ store float %result, float addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64:
+define void @s_sint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0 {
+ %result = sitofp <2 x i64> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64:
+define void @v_sint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
+ %value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
+ %result = sitofp <4 x i64> %value to <4 x float>
+ store <4 x float> %result, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll
index 8506441d13615..851085c9535d0 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -2,63 +2,120 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32:
-; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}}
-define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) {
+
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
%result = sitofp i32 %in to float
store float %result, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}sint_to_fp_v2i32:
-; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+; FUNC-LABEL: {{^}}v_sint_to_fp_i32_to_f32:
+; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
+; R600: INT_TO_FLT
+define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
+ %result = sitofp i32 %val to float
+ store float %result, float addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sint_to_fp_v2i32:
; SI: v_cvt_f32_i32_e32
; SI: v_cvt_f32_i32_e32
-define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
%result = sitofp <2 x i32> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}sint_to_fp_v4i32:
+; FUNC-LABEL: {{^}}s_sint_to_fp_v4i32_to_v4f32:
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: s_endpgm
+
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %result = sitofp <4 x i32> %value to <4 x float>
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+; FUNC-LABEL: {{^}}v_sint_to_fp_v4i32:
; SI: v_cvt_f32_i32_e32
; SI: v_cvt_f32_i32_e32
; SI: v_cvt_f32_i32_e32
; SI: v_cvt_f32_i32_e32
-define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
- %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
+
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
+ %value = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep
%result = sitofp <4 x i32> %value to <4 x float>
- store <4 x float> %result, <4 x float> addrspace(1)* %out
+ store <4 x float> %result, <4 x float> addrspace(1)* %out.gep
ret void
}
-; FUNC-LABEL: {{^}}sint_to_fp_i1_f32:
+; FUNC-LABEL: {{^}}s_sint_to_fp_i1_f32:
; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
+define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
%cmp = icmp eq i32 %in, 0
%fp = uitofp i1 %cmp to float
- store float %fp, float addrspace(1)* %out, align 4
+ store float %fp, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load:
+; FUNC-LABEL: {{^}}s_sint_to_fp_i1_f32_load:
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
+define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
%fp = sitofp i1 %in to float
- store float %fp, float addrspace(1)* %out, align 4
+ store float %fp, float addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}v_sint_to_fp_i1_f32_load:
+; SI: {{buffer|flat}}_load_ubyte
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI: v_cmp_eq_i32
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
+; SI: {{buffer|flat}}_store_dword [[RESULT]],
+; SI: s_endpgm
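+; (sitofp of i1 true yields -1.0, hence the select between 0 and -1.0 above.)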
+define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %val = load i1, i1 addrspace(1)* %in.gep
+ %fp = sitofp i1 %val to float
+ store float %fp, float addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index de22a22e50290..2a09e0b204987 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,12 +1,11 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-;EG-LABEL: {{^}}test:
-;EG-NOT: SETGE_INT
-;EG: CF_END
-
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+; FUNC-LABEL: {{^}}udiv_i32:
+; EG-NOT: SETGE_INT
+; EG: CF_END
+define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1) * %in
%b = load i32, i32 addrspace(1) * %b_ptr
@@ -15,16 +14,24 @@ define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-;The code generated by udiv is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 udiv
+; FUNC-LABEL: {{^}}s_udiv_i32:
+
+define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %result = udiv i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+
+; The code generated by udiv is long and complex and may frequently
+; change. The goal of this test is to make sure the ISel doesn't fail
+; when it gets a v4i32 udiv.
-;EG-LABEL: {{^}}test2:
-;EG: CF_END
-;SI-LABEL: {{^}}test2:
-;SI: s_endpgm
+; FUNC-LABEL: {{^}}udiv_v2i32:
+; EG: CF_END
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+; SI: s_endpgm
+define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -33,12 +40,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
ret void
}
-;EG-LABEL: {{^}}test4:
-;EG: CF_END
-;SI-LABEL: {{^}}test4:
-;SI: s_endpgm
-
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; FUNC-LABEL: {{^}}udiv_v4i32:
+; EG: CF_END
+; SI: s_endpgm
+define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -46,3 +51,43 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}udiv_i32_div_pow2:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]]
+; SI: buffer_store_dword [[RESULT]]
+define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %result = udiv i32 %a, 16
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv_i32_div_k_even:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfabbd9c1
+; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
+; SI: buffer_store_dword [[RESULT]]
+define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %result = udiv i32 %a, 34259182
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv_i32_div_k_odd:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7d5deca3
+; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
+; SI: buffer_store_dword [[RESULT]]
+define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %result = udiv i32 %a, 34259183
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
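+
+; An aside on the two k cases above (informal, not itself checked): udiv by a
+; non-power-of-2 constant d is lowered to a multiply-high by a magic constant
+; m chosen so that m/2^(32+s) slightly overestimates 1/d, followed by a right
+; shift; with the shift s of 25 (even case) or 24 (odd case) applied to the
+; high half, floor(a/d) == (a*m) >> (32+s) for every 32-bit a.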
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
new file mode 100644
index 0000000000000..3ab11442d5cc3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+
+; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
+
+; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
+define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+ %result = uitofp i64 %in to float
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_uint_to_fp_i64_to_f32:
+; GCN: {{buffer|flat}}_load_dwordx2
+
+; GCN: v_ffbh_u32
+; GCN: v_ffbh_u32
+; GCN: v_cndmask
+; GCN: v_cndmask
+
+; GCN-DAG: v_cmp_eq_i64
+; GCN-DAG: v_cmp_lt_u64
+
+; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
+; GCN: {{buffer|flat}}_store_dword [[VR]]
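+;
+; Informally, this is the same ffbh-normalize-and-round sequence as the
+; signed version in sint_to_fp.i64.ll minus the sign handling; the final
+; v_add_i32 composes the exponent with the mantissa bits of the result.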
+define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep
+ %result = uitofp i64 %val to float
+ store float %result, float addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64:
+define void @s_uint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0 {
+ %result = uitofp <2 x i64> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64:
+define void @v_uint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
+ %value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
+ %result = uitofp <4 x i64> %value to <4 x float>
+ store <4 x float> %result, <4 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll
index 00fea80b1bc83..a3343d1e2d9cb 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -2,81 +2,138 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32:
-; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-
+; FUNC-LABEL: {{^}}s_uint_to_fp_i32_to_f32:
; SI: v_cvt_f32_u32_e32
-; SI: s_endpgm
-define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) {
+
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
%result = uitofp i32 %in to float
store float %result, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32:
-; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+; FUNC-LABEL: {{^}}v_uint_to_fp_i32_to_f32:
+; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
+
+; R600: UINT_TO_FLT
+define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
+ %result = uitofp i32 %val to float
+ store float %result, float addrspace(1)* %out.gep
+ ret void
+}
+; FUNC-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f32:
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
-; SI: s_endpgm
-define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
%result = uitofp <2 x i32> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32:
-; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; FUNC-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f32:
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
; SI: s_endpgm
-define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%value = load <4 x i32>, <4 x i32> addrspace(1) * %in
%result = uitofp <4 x i32> %value to <4 x float>
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32:
-; R600: UINT_TO_FLT
-; R600: UINT_TO_FLT
-; R600: MULADD_IEEE
+; FUNC-LABEL: {{^}}v_uint_to_fp_v4i32:
; SI: v_cvt_f32_u32_e32
; SI: v_cvt_f32_u32_e32
-; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000
-; SI: s_endpgm
-define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) {
-entry:
- %0 = uitofp i64 %in to float
- store float %0, float addrspace(1)* %out
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
+ %value = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep
+ %result = uitofp <4 x i32> %value to <4 x float>
+ store <4 x float> %result, <4 x float> addrspace(1)* %out.gep
ret void
}
-; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32:
+; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f32:
; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) {
+define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
%cmp = icmp eq i32 %in, 0
%fp = uitofp i1 %cmp to float
- store float %fp, float addrspace(1)* %out, align 4
+ store float %fp, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load:
+; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f32_load:
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
-define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) {
+define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
%fp = uitofp i1 %in to float
- store float %fp, float addrspace(1)* %out, align 4
+ store float %fp, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_uint_to_fp_i1_f32_load:
+; SI: {{buffer|flat}}_load_ubyte
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI: v_cmp_eq_i32
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
+; SI: {{buffer|flat}}_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %val = load i1, i1 addrspace(1)* %in.gep
+ %fp = uitofp i1 %val to float
+ store float %fp, float addrspace(1)* %out.gep
ret void
}
+
+; FIXME: Repeated here to test r600
+; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
+; R600: FFBH_UINT
+; R600: FFBH_UINT
+; R600: CNDE_INT
+; R600: CNDE_INT
+
+; R600-DAG: SETGT_UINT
+; R600-DAG: SETGT_UINT
+; R600-DAG: SETE_INT
+
+define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+entry:
+ %cvt = uitofp i64 %in to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/ARM/bit-reverse-to-rbit.ll b/test/CodeGen/ARM/bit-reverse-to-rbit.ll
new file mode 100644
index 0000000000000..8482cbf69f2e0
--- /dev/null
+++ b/test/CodeGen/ARM/bit-reverse-to-rbit.ll
@@ -0,0 +1,34 @@
+;RUN: opt -instcombine -S < %s | llc -mtriple=armv5e--linux-gnueabi | FileCheck %s
+;RUN: opt -instcombine -S < %s | llc -mtriple=thumbv4t--linux-gnueabi | FileCheck %s
+;RUN: opt -instcombine -S < %s | llc -mtriple=armv6--linux-gnueabi | FileCheck %s
+
+;RUN: opt -instcombine -S < %s | llc -mtriple=armv7--linux-gnueabi | FileCheck %s --check-prefix=RBIT
+;RUN: opt -instcombine -S < %s | llc -mtriple=thumbv8--linux-gnueabi | FileCheck %s --check-prefix=RBIT
+
+;CHECK-NOT: rbit
+;RBIT: rbit
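+
+;RBIT only exists on the newer targets here (armv7 and thumbv8), so the older
+;triples above must expand @llvm.bitreverse.i8 into a shift-and-mask sequence,
+;which is what the CHECK-NOT line pins down.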
+
+define void @byte_reversal(i8* %p, i32 %n) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp ult i32 %i.0, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = sext i32 %i.0 to i64
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %0
+ %1 = load i8, i8* %arrayidx, align 1
+ %or19 = call i8 @llvm.bitreverse.i8(i8 %1)
+ store i8 %or19, i8* %arrayidx, align 1
+ %inc = add i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i8 @llvm.bitreverse.i8(i8)
diff --git a/test/CodeGen/ARM/cxx-tlscc.ll b/test/CodeGen/ARM/cxx-tlscc.ll
new file mode 100644
index 0000000000000..7b776d4b8e88c
--- /dev/null
+++ b/test/CodeGen/ARM/cxx-tlscc.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=armv7k-apple-watchos2.0 | FileCheck %s
+; RUN: llc < %s -mtriple=armv7k-apple-watchos2.0 -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s
+; RUN: llc < %s -mtriple=armv7-apple-ios8.0 | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-apple-ios8.0 -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s
+
+%struct.S = type { i8 }
+
+@sg = internal thread_local global %struct.S zeroinitializer, align 1
+@__dso_handle = external global i8
+@__tls_guard = internal thread_local unnamed_addr global i1 false
+
+declare %struct.S* @_ZN1SC1Ev(%struct.S* returned)
+declare %struct.S* @_ZN1SD1Ev(%struct.S* returned)
+declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
+
+define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
+ %.b.i = load i1, i1* @__tls_guard, align 1
+ br i1 %.b.i, label %__tls_init.exit, label %init.i
+
+init.i:
+ store i1 true, i1* @__tls_guard, align 1
+ %call.i.i = tail call %struct.S* @_ZN1SC1Ev(%struct.S* nonnull @sg)
+ %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (%struct.S* (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle)
+ br label %__tls_init.exit
+
+__tls_init.exit:
+ ret %struct.S* @sg
+}
+
+; CHECK-LABEL: _ZTW2sg
+; CHECK: push {lr}
+; CHECK-NOT: push {r1, r2, r3, r4, r7, lr}
+; CHECK-NOT: push {r9, r12}
+; CHECK-NOT: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+; CHECK-NOT: vpush {d0, d1, d2, d3, d4, d5, d6, d7}
+; CHECK: blx
+; CHECK: bne [[BB_end:.?LBB0_[0-9]+]]
+; CHECK: blx
+; CHECK: tlv_atexit
+; CHECK: [[BB_end]]:
+; CHECK: blx
+; CHECK-NOT: vpop {d0, d1, d2, d3, d4, d5, d6, d7}
+; CHECK-NOT: vpop {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+; CHECK-NOT: pop {r9, r12}
+; CHECK-NOT: pop {r1, r2, r3, r4, r7, pc}
+; CHECK: pop {lr}
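+
+; Informally: cxx_fast_tlscc is meant to keep this wrapper's fast path cheap,
+; so it should spill only lr rather than the usual caller-saved GPR/VFP sets;
+; that is what the CHECK-NOTs above assert.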
diff --git a/test/CodeGen/ARM/darwin-tls.ll b/test/CodeGen/ARM/darwin-tls.ll
new file mode 100644
index 0000000000000..e19953222020b
--- /dev/null
+++ b/test/CodeGen/ARM/darwin-tls.ll
@@ -0,0 +1,165 @@
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - -fast-isel %s | FileCheck %s --check-prefix=T2-MOVT-PIC
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - %s -mattr=+no-movt | FileCheck %s --check-prefix=T2-LIT-PIC
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - %s -relocation-model=static | FileCheck %s --check-prefix=T2-MOVT-STATIC
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - %s -mattr=+no-movt -relocation-model=static | FileCheck %s --check-prefix=T2-LIT-STATIC
+; RUN: llc -mtriple=armv7s-apple-ios7.0 -o - %s | FileCheck %s --check-prefix=ARM-MOVT-PIC
+; RUN: llc -mtriple=armv7s-apple-ios7.0 -o - %s -mattr=+no-movt | FileCheck %s --check-prefix=ARM-LIT-PIC
+; RUN: llc -mtriple=armv7s-apple-ios7.0 -o - %s -relocation-model=static | FileCheck %s --check-prefix=ARM-MOVT-STATIC
+; RUN: llc -mtriple=armv7s-apple-ios7.0 -o - %s -mattr=+no-movt -relocation-model=static | FileCheck %s --check-prefix=ARM-LIT-STATIC
+
+
+@local_tls_var = thread_local global i32 0
+@external_tls_var = external thread_local global i32
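+
+; All eight configurations below end in the same Darwin TLV calling sequence
+; (an informal sketch): materialize the address of the variable's TLV
+; descriptor, load the thunk pointer held in its first word, blx it with the
+; descriptor address in r0, and dereference the r0 it returns to get the
+; value.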
+
+define i32 @test_local_tls() {
+; T2-MOVT-PIC-LABEL: test_local_tls:
+; T2-MOVT-PIC: movw r0, :lower16:(_local_tls_var-([[PCREL_LOC:LPC[0-9]+_[0-9]+]]+4))
+; T2-MOVT-PIC: movt r0, :upper16:(_local_tls_var-([[PCREL_LOC]]+4))
+; T2-MOVT-PIC: [[PCREL_LOC]]:
+; T2-MOVT-PIC-NEXT: add r0, pc
+; T2-MOVT-PIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-MOVT-PIC: blx [[TLV_GET_ADDR]]
+; T2-MOVT-PIC: ldr r0, [r0]
+
+; T2-LIT-PIC-LABEL: test_local_tls:
+; T2-LIT-PIC: ldr r0, [[LOCAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; T2-LIT-PIC: [[PCREL_LOC:LPC[0-9]+_[0-9]+]]:
+; T2-LIT-PIC-NEXT: add r0, pc
+; T2-LIT-PIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-LIT-PIC: blx [[TLV_GET_ADDR]]
+; T2-LIT-PIC: ldr r0, [r0]
+; T2-LIT-PIC: [[LOCAL_VAR_ADDR]]:
+; T2-LIT-PIC-NEXT: .long _local_tls_var-([[PCREL_LOC]]+4)
+
+; T2-MOVT-STATIC-LABEL: test_local_tls:
+; T2-MOVT-STATIC: movw r0, :lower16:_local_tls_var
+; T2-MOVT-STATIC: movt r0, :upper16:_local_tls_var
+; T2-MOVT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-MOVT-STATIC: blx [[TLV_GET_ADDR]]
+; T2-MOVT-STATIC: ldr r0, [r0]
+
+; T2-LIT-STATIC-LABEL: test_local_tls:
+; T2-LIT-STATIC: ldr r0, [[LOCAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; T2-LIT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-LIT-STATIC: blx [[TLV_GET_ADDR]]
+; T2-LIT-STATIC: ldr r0, [r0]
+; T2-LIT-STATIC: [[LOCAL_VAR_ADDR]]:
+; T2-LIT-STATIC-NEXT: .long _local_tls_var
+
+; ARM-MOVT-PIC-LABEL: test_local_tls:
+; ARM-MOVT-PIC: movw [[VARPC1:r[0-9]+]], :lower16:(_local_tls_var-([[PCREL_LOC1:LPC[0-9]+_[0-9]+]]+8))
+; ARM-MOVT-PIC: movt [[VARPC1]], :upper16:(_local_tls_var-([[PCREL_LOC1]]+8))
+; ARM-MOVT-PIC: [[PCREL_LOC1]]:
+; ARM-MOVT-PIC: add r0, pc, [[VARPC1]]
+; ARM-MOVT-PIC: movw [[VARPC2:r[0-9]+]], :lower16:(_local_tls_var-([[PCREL_LOC2:LPC[0-9]+_[0-9]+]]+8))
+; ARM-MOVT-PIC: movt [[VARPC2]], :upper16:(_local_tls_var-([[PCREL_LOC2]]+8))
+; ARM-MOVT-PIC: [[PCREL_LOC2]]:
+; ARM-MOVT-PIC-NEXT: ldr [[TLV_GET_ADDR:r[0-9]+]], [pc, [[VARPC2]]]
+; ARM-MOVT-PIC: blx [[TLV_GET_ADDR]]
+; ARM-MOVT-PIC: ldr r0, [r0]
+
+; ARM-LIT-PIC-LABEL: test_local_tls:
+; ARM-LIT-PIC: ldr r0, [[LOCAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; ARM-LIT-PIC: [[PCREL_LOC:LPC[0-9]+_[0-9]+]]:
+; ARM-LIT-PIC-NEXT: add r0, pc
+; ARM-LIT-PIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; ARM-LIT-PIC: blx [[TLV_GET_ADDR]]
+; ARM-LIT-PIC: ldr r0, [r0]
+; ARM-LIT-PIC: [[LOCAL_VAR_ADDR]]:
+; ARM-LIT-PIC-NEXT: .long _local_tls_var-([[PCREL_LOC]]+8)
+
+; ARM-MOVT-STATIC-LABEL: test_local_tls:
+; ARM-MOVT-STATIC: movw r0, :lower16:_local_tls_var
+; ARM-MOVT-STATIC: movt r0, :upper16:_local_tls_var
+; ARM-MOVT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; ARM-MOVT-STATIC: blx [[TLV_GET_ADDR]]
+; ARM-MOVT-STATIC: ldr r0, [r0]
+
+; ARM-LIT-STATIC-LABEL: test_local_tls:
+; ARM-LIT-STATIC: ldr r0, [[LOCAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; ARM-LIT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; ARM-LIT-STATIC: blx [[TLV_GET_ADDR]]
+; ARM-LIT-STATIC: ldr r0, [r0]
+; ARM-LIT-STATIC: [[LOCAL_VAR_ADDR]]:
+; ARM-LIT-STATIC-NEXT: .long _local_tls_var
+
+
+ %val = load i32, i32* @local_tls_var, align 4
+ ret i32 %val
+}
+
+define i32 @test_external_tls() {
+; T2-MOVT-PIC-LABEL: test_external_tls:
+; T2-MOVT-PIC: movw r[[EXTGOT:[0-9]+]], :lower16:(L_external_tls_var$non_lazy_ptr-([[PCREL_LOC:LPC[0-9]+_[0-9]+]]+4))
+; T2-MOVT-PIC: movt r[[EXTGOT]], :upper16:(L_external_tls_var$non_lazy_ptr-([[PCREL_LOC]]+4))
+; T2-MOVT-PIC: [[PCREL_LOC]]:
+; T2-MOVT-PIC-NEXT: add r[[EXTGOT]], pc
+; T2-MOVT-PIC: ldr r0, [r[[EXTGOT]]]
+; T2-MOVT-PIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-MOVT-PIC: blx [[TLV_GET_ADDR]]
+; T2-MOVT-PIC: ldr r0, [r0]
+
+; T2-LIT-PIC-LABEL: test_external_tls:
+; T2-LIT-PIC: ldr r[[EXTGOT:[0-9]+]], [[EXTERNAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; T2-LIT-PIC: [[PCREL_LOC:LPC[0-9]+_[0-9]+]]:
+; T2-LIT-PIC-NEXT: add r[[EXTGOT]], pc
+; T2-LIT-PIC: ldr r0, [r[[EXTGOT]]]
+; T2-LIT-PIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-LIT-PIC: blx [[TLV_GET_ADDR]]
+; T2-LIT-PIC: ldr r0, [r0]
+; T2-LIT-PIC: [[EXTERNAL_VAR_ADDR]]:
+; T2-LIT-PIC-NEXT: .long L_external_tls_var$non_lazy_ptr-([[PCREL_LOC]]+4)
+
+; T2-MOVT-STATIC-LABEL: test_external_tls:
+; T2-MOVT-STATIC: movw r0, :lower16:_external_tls_var
+; T2-MOVT-STATIC: movt r0, :upper16:_external_tls_var
+; T2-MOVT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-MOVT-STATIC: blx [[TLV_GET_ADDR]]
+; T2-MOVT-STATIC: ldr r0, [r0]
+
+; T2-LIT-STATIC-LABEL: test_external_tls:
+; T2-LIT-STATIC: ldr r0, [[EXTERNAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; T2-LIT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; T2-LIT-STATIC: blx [[TLV_GET_ADDR]]
+; T2-LIT-STATIC: ldr r0, [r0]
+; T2-LIT-STATIC: [[EXTERNAL_VAR_ADDR]]:
+; T2-LIT-STATIC-NEXT: .long _external_tls_var
+
+; ARM-MOVT-PIC-LABEL: test_external_tls:
+; ARM-MOVT-PIC: movw r[[EXTGOT:[0-9]+]], :lower16:(L_external_tls_var$non_lazy_ptr-([[PCREL_LOC:LPC[0-9]+_[0-9]+]]+8))
+; ARM-MOVT-PIC: movt r[[EXTGOT]], :upper16:(L_external_tls_var$non_lazy_ptr-([[PCREL_LOC]]+8))
+; ARM-MOVT-PIC: [[PCREL_LOC]]:
+; ARM-MOVT-PIC-NEXT: ldr r0, [pc, r[[EXTGOT]]]
+; ARM-MOVT-PIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; ARM-MOVT-PIC: blx [[TLV_GET_ADDR]]
+; ARM-MOVT-PIC: ldr r0, [r0]
+
+; ARM-LIT-PIC-LABEL: test_external_tls:
+; ARM-LIT-PIC: ldr r[[EXTGOT:[0-9]+]], [[EXTERNAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; ARM-LIT-PIC: [[PCREL_LOC:LPC[0-9]+_[0-9]+]]:
+; ARM-LIT-PIC-NEXT: add r[[EXTGOT]], pc
+; ARM-LIT-PIC: ldr r0, [r[[EXTGOT]]]
+; ARM-LIT-PIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; ARM-LIT-PIC: blx [[TLV_GET_ADDR]]
+; ARM-LIT-PIC: ldr r0, [r0]
+; ARM-LIT-PIC: [[EXTERNAL_VAR_ADDR]]:
+; ARM-LIT-PIC-NEXT: .long L_external_tls_var$non_lazy_ptr-([[PCREL_LOC]]+8)
+
+; ARM-MOVT-STATIC-LABEL: test_external_tls:
+; ARM-MOVT-STATIC: movw r0, :lower16:_external_tls_var
+; ARM-MOVT-STATIC: movt r0, :upper16:_external_tls_var
+; ARM-MOVT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; ARM-MOVT-STATIC: blx [[TLV_GET_ADDR]]
+; ARM-MOVT-STATIC: ldr r0, [r0]
+
+; ARM-LIT-STATIC-LABEL: test_external_tls:
+; ARM-LIT-STATIC: ldr r0, [[EXTERNAL_VAR_ADDR:LCPI[0-9]+_[0-9]+]]
+; ARM-LIT-STATIC: ldr [[TLV_GET_ADDR:r[0-9]+]], [r0]
+; ARM-LIT-STATIC: blx [[TLV_GET_ADDR]]
+; ARM-LIT-STATIC: ldr r0, [r0]
+; ARM-LIT-STATIC: [[EXTERNAL_VAR_ADDR]]:
+; ARM-LIT-STATIC-NEXT: .long _external_tls_var
+
+ %val = load i32, i32* @external_tls_var, align 4
+ ret i32 %val
+}
diff --git a/test/CodeGen/ARM/fabs-to-bfc.ll b/test/CodeGen/ARM/fabs-to-bfc.ll
new file mode 100644
index 0000000000000..1a2e04584a913
--- /dev/null
+++ b/test/CodeGen/ARM/fabs-to-bfc.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=armv5e-none-linux-gnueabi -mattr=+vfp2 | FileCheck %s -check-prefix=CHECK-VABS
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=+vfp3 | FileCheck %s -check-prefix=CHECK-BFC
+
+
+define double @test(double %tx) {
+;CHECK-LABEL: test:
+ %call = tail call double @fabs(double %tx)
+ ret double %call
+;CHECK-VABS: vabs.f64
+;CHECK-BFC: bfc
+}
+
+declare double @fabs(double) readnone
+
diff --git a/test/CodeGen/ARM/fp16-args.ll b/test/CodeGen/ARM/fp16-args.ll
index 31a20f85483bd..708fae7f9ffa9 100644
--- a/test/CodeGen/ARM/fp16-args.ll
+++ b/test/CodeGen/ARM/fp16-args.ll
@@ -32,9 +32,10 @@ entry:
; HARD: vcvtb.f32.f16 {{s[0-9]+}}, s1
; HARD: vcvtb.f32.f16 {{s[0-9]+}}, s0
; HARD: vadd.f32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; HARD: vcvtb.f16.f32 s0, {{s[0-9]+}}
-; HARD-NOT: vmov
-; HARD-NOT: uxth
+; HARD: vcvtb.f16.f32 [[SREG:s[0-9]+]], {{s[0-9]+}}
+; HARD-NEXT: vmov [[REG0:r[0-9]+]], [[SREG]]
+; HARD-NEXT: uxth [[REG1:r[0-9]+]], [[REG0]]
+; HARD-NEXT: vmov s0, [[REG1]]
; CHECK: bx lr
}
diff --git a/test/CodeGen/ARM/fp16-v3.ll b/test/CodeGen/ARM/fp16-v3.ll
new file mode 100644
index 0000000000000..6ed9c9d22c9d2
--- /dev/null
+++ b/test/CodeGen/ARM/fp16-v3.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7a--none-eabi"
+
+; CHECK-LABEL: test_vec3:
+; CHECK: vcvtb.f32.f16
+; CHECK: vcvt.f32.s32
+; CHECK: vadd.f32
+; CHECK-NEXT: vcvtb.f16.f32 [[SREG:s[0-9]+]], {{.*}}
+; CHECK-NEXT: vmov [[RREG1:r[0-9]+]], [[SREG]]
+; CHECK-NEXT: uxth [[RREG2:r[0-9]+]], [[RREG1]]
+; CHECK-NEXT: pkhbt [[RREG3:r[0-9]+]], [[RREG1]], [[RREG1]], lsl #16
+; CHECK-DAG: strh [[RREG1]], [r0, #4]
+; CHECK-DAG: vmov [[DREG:d[0-9]+]], [[RREG3]], [[RREG2]]
+; CHECK-DAG: vst1.32 {[[DREG]][0]}, [r0:32]
+; CHECK-NEXT: bx lr
+define void @test_vec3(<3 x half>* %arr, i32 %i) #0 {
+ %H = sitofp i32 %i to half
+ %S = fadd half %H, 0xH4A00
+ %1 = insertelement <3 x half> undef, half %S, i32 0
+ %2 = insertelement <3 x half> %1, half %S, i32 1
+ %3 = insertelement <3 x half> %2, half %S, i32 2
+ store <3 x half> %3, <3 x half>* %arr, align 8
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/ARM/inlineasm-imm-thumb.ll b/test/CodeGen/ARM/inlineasm-imm-thumb.ll
new file mode 100644
index 0000000000000..80be870743f50
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-imm-thumb.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=thumbv5-none-linux-gnueabi -no-integrated-as %s -o /dev/null
+
+; Test thumb-mode "I" constraint, for any Data Processing immediate.
+define void @testI() {
+ tail call void asm sideeffect ".word $0", "I"( i32 255 ) nounwind
+ ret void
+}
+
+; Test thumb-mode "J" constraint, for compatibility with unknown use in GCC.
+define void @testJ() {
+ tail call void asm sideeffect ".word $0", "J"( i32 -254 ) nounwind
+ ret void
+}
+
+; Test thumb-mode "L" constraint, for negated Data Processing immediates.
+define void @testL() {
+ tail call void asm sideeffect ".word $0", "L"( i32 -7 ) nounwind
+ ret void
+}
+
diff --git a/test/CodeGen/ARM/inlineasm-imm-thumb2.ll b/test/CodeGen/ARM/inlineasm-imm-thumb2.ll
new file mode 100644
index 0000000000000..c54f3b8aa5f94
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-imm-thumb2.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=thumbv7-linux-gnu -no-integrated-as %s -o /dev/null
+
+; Test thumb2-mode "I" constraint, for any Data Processing immediate.
+define i32 @testI(i32 %x) {
+ %y = call i32 asm "add $0, $1, $2", "=r,r,I"( i32 %x, i32 65280 ) nounwind
+ ret i32 %y
+}
+
+; Test thumb2-mode "J" constraint, for compatibility with unknown use in GCC.
+define void @testJ() {
+ tail call void asm sideeffect ".word $0", "J"( i32 4080 ) nounwind
+ ret void
+}
+
+; Test thumb2-mode "K" constraint, for bitwise inverted Data Processing immediates.
+define void @testK() {
+ tail call void asm sideeffect ".word $0", "K"( i32 16777215 ) nounwind
+ ret void
+}
+
+; Test thumb2-mode "L" constraint, for negated Data Processing immediates.
+define void @testL() {
+ tail call void asm sideeffect ".word $0", "L"( i32 -65280 ) nounwind
+ ret void
+}
+
+; Test thumb2-mode "M" constraint, for value between 0 and 32.
+define i32 @testM(i32 %x) {
+ %y = call i32 asm "lsl $0, $1, $2", "=r,r,M"( i32 %x, i32 31 ) nounwind
+ ret i32 %y
+}
diff --git a/test/CodeGen/ARM/zero-cycle-zero.ll b/test/CodeGen/ARM/zero-cycle-zero.ll
index 121a87f5b84d2..4e8696f4418ac 100644
--- a/test/CodeGen/ARM/zero-cycle-zero.ll
+++ b/test/CodeGen/ARM/zero-cycle-zero.ll
@@ -1,26 +1,19 @@
-; RUN: llc -mtriple=armv8 -mcpu=cyclone < %s | FileCheck %s --check-prefix=CHECK-CYCLONE
-; RUN: llc -mtriple=armv8 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-SWIFT
+; RUN: llc -mtriple=armv8 -mcpu=cyclone < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTSWIFT
+; RUN: llc -mtriple=armv8 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=armv8 -mcpu=cortex-a57 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTSWIFT
declare arm_aapcs_vfpcc void @take_vec64(<2 x i32>)
define void @test_vec64() {
-; CHECK-CYCLONE-LABEL: test_vec64:
-; CHECK-SWIFT-LABEL: test_vec64:
+; CHECK-LABEL: test_vec64:
call arm_aapcs_vfpcc void @take_vec64(<2 x i32> <i32 0, i32 0>)
call arm_aapcs_vfpcc void @take_vec64(<2 x i32> <i32 0, i32 0>)
-; CHECK-CYCLONE-NOT: vmov.f64 d0,
-; CHECK-CYCLONE: vmov.i32 d0, #0
-; CHECK-CYCLONE: bl
-; CHECK-CYCLONE: vmov.i32 d0, #0
-; CHECK-CYCLONE: bl
-
-; CHECK-SWIFT: vmov.f64 [[ZEROREG:d[0-9]+]],
-; CHECK-SWIFT: vmov.i32 [[ZEROREG]], #0
-; CHECK-SWIFT: vorr d0, [[ZEROREG]], [[ZEROREG]]
-; CHECK-SWIFT: bl
-; CHECK-SWIFT: vorr d0, [[ZEROREG]], [[ZEROREG]]
-; CHECK-SWIFT: bl
+; CHECK-NOTSWIFT-NOT: vmov.f64 d0,
+; CHECK: vmov.i32 d0, #0
+; CHECK: bl
+; CHECK: vmov.i32 d0, #0
+; CHECK: bl
ret void
}
@@ -28,23 +21,15 @@ define void @test_vec64() {
declare arm_aapcs_vfpcc void @take_vec128(<8 x i16>)
define void @test_vec128() {
-; CHECK-CYCLONE-LABEL: test_vec128:
-; CHECK-SWIFT-LABEL: test_vec128:
+; CHECK-LABEL: test_vec128:
call arm_aapcs_vfpcc void @take_vec128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
call arm_aapcs_vfpcc void @take_vec128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
-; CHECK-CYCLONE-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
-; CHECK-CYCLONE: vmov.i32 q0, #0
-; CHECK-CYCLONE: bl
-; CHECK-CYCLONE: vmov.i32 q0, #0
-; CHECK-CYCLONE: bl
-
-; CHECK-SWIFT-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
-; CHECK-SWIFT: vmov.i32 [[ZEROREG:q[0-9]+]], #0
-; CHECK-SWIFT: vorr q0, [[ZEROREG]], [[ZEROREG]]
-; CHECK-SWIFT: bl
-; CHECK-SWIFT: vorr q0, [[ZEROREG]], [[ZEROREG]]
-; CHECK-SWIFT: bl
+; CHECK-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK: vmov.i32 q0, #0
+; CHECK: bl
+; CHECK: vmov.i32 q0, #0
+; CHECK: bl
ret void
}
@@ -52,16 +37,15 @@ define void @test_vec128() {
declare void @take_i32(i32)
define void @test_i32() {
-; CHECK-CYCLONE-LABEL: test_i32:
-; CHECK-SWIFT-LABEL: test_i32:
+; CHECK-LABEL: test_i32:
call arm_aapcs_vfpcc void @take_i32(i32 0)
call arm_aapcs_vfpcc void @take_i32(i32 0)
-; CHECK-CYCLONE-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
-; CHECK-CYCLONE: mov r0, #0
-; CHECK-CYCLONE: bl
-; CHECK-CYCLONE: mov r0, #0
-; CHECK-CYCLONE: bl
+; CHECK-NOTSWIFT-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK: mov r0, #0
+; CHECK: bl
+; CHECK: mov r0, #0
+; CHECK: bl
; It doesn't particularly matter what Swift does here; there isn't carefully
; crafted behaviour that we might break in Cyclone.
diff --git a/test/CodeGen/Hexagon/bit-phi.ll b/test/CodeGen/Hexagon/bit-phi.ll
new file mode 100644
index 0000000000000..86b18d8bf2563
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-phi.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
+target triple = "hexagon-unknown--elf"
+
+%struct.item = type { i32, i8*, i8*, i32, i8, i8, i16, i32, i8, i16, i32 }
+
+declare %struct.item* @foo(%struct.item*, i8*, i32) #1
+
+; Function Attrs: nounwind
+define i32 @bar(%struct.item** %ptr, i8* %buf, i32 %c, i8* %d, i32 %e) #1 {
+entry:
+ br i1 undef, label %return, label %if.end
+
+if.end: ; preds = %entry
+ br i1 undef, label %while.cond13.preheader, label %if.end3
+
+if.end3: ; preds = %if.end
+ br label %while.cond13.preheader
+
+while.cond13.preheader: ; preds = %if.end3, %if.end
+ br i1 undef, label %while.body20, label %return
+
+while.body20: ; preds = %if.end38, %while.cond13.preheader
+ %addr.0100 = phi i32 [ undef, %if.end38 ], [ %c, %while.cond13.preheader ]
+ %cond = select i1 undef, i32 %addr.0100, i32 undef
+ br i1 undef, label %while.body20.if.end38_crit_edge, label %if.then32
+
+while.body20.if.end38_crit_edge: ; preds = %while.body20
+ %conv39.pre = and i32 %cond, 65535
+ br label %if.end38
+
+if.then32: ; preds = %while.body20
+ %conv33 = and i32 %cond, 65535
+ %.pre = load %struct.item*, %struct.item** %ptr, align 4, !tbaa !1
+ br label %if.end38
+
+if.end38: ; preds = %if.then32, %while.body20.if.end38_crit_edge
+ %conv39.pre-phi = phi i32 [ %conv39.pre, %while.body20.if.end38_crit_edge ], [ %conv33, %if.then32 ]
+ %0 = phi %struct.item* [ undef, %while.body20.if.end38_crit_edge ], [ %.pre, %if.then32 ]
+ %add = add i32 %conv39.pre-phi, 0
+ %call52 = tail call %struct.item* @foo(%struct.item* %0, i8* %d, i32 %e) #1
+ br i1 undef, label %while.body20, label %return
+
+return: ; preds = %if.end38, %while.cond13.preheader, %entry
+ %retval.0 = phi i32 [ 0, %entry ], [ 0, %while.cond13.preheader ], [ %add, %if.end38 ]
+ ret i32 %retval.0
+}
+
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/postinc-offset.ll b/test/CodeGen/Hexagon/postinc-offset.ll
index 5e0f4751f3056..cf8c4e5f71d25 100644
--- a/test/CodeGen/Hexagon/postinc-offset.ll
+++ b/test/CodeGen/Hexagon/postinc-offset.ll
@@ -1,4 +1,5 @@
-; RUN: llc -enable-aa-sched-mi -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; RUN: llc -enable-aa-sched-mi -march=hexagon -mcpu=hexagonv5 -rdf-opt=0 \
+; RUN: < %s | FileCheck %s
; CHECK: {
; CHECK: ={{ *}}memd([[REG0:(r[0-9]+)]]{{ *}}++{{ *}}#8)
diff --git a/test/CodeGen/Hexagon/rdf-copy.ll b/test/CodeGen/Hexagon/rdf-copy.ll
new file mode 100644
index 0000000000000..96153ca31fa4d
--- /dev/null
+++ b/test/CodeGen/Hexagon/rdf-copy.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Check that
+; {
+; r1 = r0
+; }
+; {
+; r0 = memw(r1 + #0)
+; }
+; was copy-propagated to
+; {
+; r1 = r0
+; r0 = memw(r0 + #0)
+; }
+;
+; CHECK-LABEL: LBB0_1
+; CHECK: [[DST:r[0-9]+]] = [[SRC:r[0-9]+]]
+; CHECK-DAG: memw([[SRC]]
+; CHECK-NOT: memw([[DST]]
+; CHECK-LABEL: LBB0_2
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+%union.t = type { %struct.t, [64 x i8] }
+%struct.t = type { [12 x i8], %struct.r*, double }
+%struct.r = type opaque
+
+define %union.t* @foo(%union.t* %chain) nounwind readonly {
+entry:
+ %tobool = icmp eq %union.t* %chain, null
+ br i1 %tobool, label %if.end, label %while.cond.preheader
+
+while.cond.preheader: ; preds = %entry
+ br label %while.cond
+
+while.cond: ; preds = %while.cond.preheader, %while.cond
+ %chain.addr.0 = phi %union.t* [ %0, %while.cond ], [ %chain, %while.cond.preheader ]
+ %chain1 = bitcast %union.t* %chain.addr.0 to %union.t**
+ %0 = load %union.t*, %union.t** %chain1, align 4, !tbaa !0
+ %tobool2 = icmp eq %union.t* %0, null
+ br i1 %tobool2, label %if.end.loopexit, label %while.cond
+
+if.end.loopexit: ; preds = %while.cond
+ br label %if.end
+
+if.end: ; preds = %if.end.loopexit, %entry
+ %chain.addr.1 = phi %union.t* [ null, %entry ], [ %chain.addr.0, %if.end.loopexit ]
+ ret %union.t* %chain.addr.1
+}
+
+!0 = !{!"any pointer", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/rdf-dead-loop.ll b/test/CodeGen/Hexagon/rdf-dead-loop.ll
new file mode 100644
index 0000000000000..3762c79d4f571
--- /dev/null
+++ b/test/CodeGen/Hexagon/rdf-dead-loop.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK-NOT: ={{.*}}add
+; CHECK-NOT: mem{{[bdhwu]}}
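+;
+; The loop below feeds %ip_vec only to the unused extractelement in %exit, so
+; the whole load/add chain is dead; the CHECK-NOTs assert that neither the
+; add nor any memory access survives.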
+
+define void @main() #0 {
+entry:
+ br label %body
+
+body:
+ %ip_vec30 = phi <2 x i32> [ %ip_vec, %body ], [ zeroinitializer, %entry ]
+ %scevgep.phi = phi i32* [ %scevgep.inc, %body ], [ undef, %entry ]
+ %polly.indvar = phi i32 [ %polly.indvar_next, %body ], [ 0, %entry ]
+ %vector_ptr = bitcast i32* %scevgep.phi to <2 x i32>*
+ %_p_vec_full = load <2 x i32>, <2 x i32>* %vector_ptr, align 8
+ %ip_vec = add <2 x i32> %_p_vec_full, %ip_vec30
+ %polly.indvar_next = add nsw i32 %polly.indvar, 2
+ %polly.loop_cond = icmp slt i32 %polly.indvar, 4
+ %scevgep.inc = getelementptr i32, i32* %scevgep.phi, i32 2
+ br i1 %polly.loop_cond, label %body, label %exit
+
+exit:
+ %0 = extractelement <2 x i32> %ip_vec, i32 1
+ ret void
+
+}
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Mips/llvm-ir/call.ll b/test/CodeGen/Mips/llvm-ir/call.ll
index 22a44da0b0696..a4b03405f72b4 100644
--- a/test/CodeGen/Mips/llvm-ir/call.ll
+++ b/test/CodeGen/Mips/llvm-ir/call.ll
@@ -182,3 +182,18 @@ define hidden void @thunk_undef_double(i32 %this, double %volume) unnamed_addr a
tail call void @undef_double(i32 undef, double undef) #8
ret void
}
+
+; Check that immediate addresses do not use jal.
+define i32 @jal_only_allows_symbols() {
+; ALL-LABEL: jal_only_allows_symbols:
+
+; ALL-NOT: {{jal }}
+; ALL: addiu $[[TGT:[0-9]+]], $zero, 1234
+; ALL-NOT: {{jal }}
+; ALL: jalr $[[TGT]]
+; ALL-NOT: {{jal }}
+
+ call void () inttoptr (i32 1234 to void ()*)()
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll
index b84d94d31494c..667676de5f337 100644
--- a/test/CodeGen/Mips/madd-msub.ll
+++ b/test/CodeGen/Mips/madd-msub.ll
@@ -18,7 +18,7 @@
; 32-DAG: [[m]]flo $3
; DSP-DAG: sra $[[T0:[0-9]+]], $6, 31
-; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: mtlo $6, $[[AC:ac[0-3]+]]
; DSP-DAG: madd $[[AC]], ${{[45]}}, ${{[45]}}
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
@@ -64,7 +64,7 @@ entry:
; 32-DAG: [[m]]flo $3
; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0
-; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: mtlo $6, $[[AC:ac[0-3]+]]
; DSP-DAG: maddu $[[AC]], ${{[45]}}, ${{[45]}}
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
@@ -101,8 +101,8 @@ entry:
; 32-DAG: [[m]]fhi $2
; 32-DAG: [[m]]flo $3
-; DSP-DAG: mthi $[[AC:ac[0-3]+]], $6
-; DSP-DAG: mtlo $[[AC]], $7
+; DSP-DAG: mthi $6, $[[AC:ac[0-3]+]]
+; DSP-DAG: mtlo $7, $[[AC]]
; DSP-DAG: madd $[[AC]], ${{[45]}}, ${{[45]}}
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
@@ -143,7 +143,7 @@ entry:
; 32-DAG: [[m]]flo $3
; DSP-DAG: sra $[[T0:[0-9]+]], $6, 31
-; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: mtlo $6, $[[AC:ac[0-3]+]]
; DSP-DAG: msub $[[AC]], ${{[45]}}, ${{[45]}}
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
@@ -189,7 +189,7 @@ entry:
; 32-DAG: [[m]]flo $3
; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0
-; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: mtlo $6, $[[AC:ac[0-3]+]]
; DSP-DAG: msubu $[[AC]], ${{[45]}}, ${{[45]}}
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
@@ -229,7 +229,7 @@ entry:
; 32-DAG: [[m]]flo $3
; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0
-; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: mtlo $6, $[[AC:ac[0-3]+]]
; DSP-DAG: msub $[[AC]], ${{[45]}}, ${{[45]}}
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
diff --git a/test/CodeGen/PowerPC/2016-01-07-BranchWeightCrash.ll b/test/CodeGen/PowerPC/2016-01-07-BranchWeightCrash.ll
new file mode 100644
index 0000000000000..65dff12f31156
--- /dev/null
+++ b/test/CodeGen/PowerPC/2016-01-07-BranchWeightCrash.ll
@@ -0,0 +1,35 @@
+; RUN: llc <%s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%struct.buffer_t = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8, i8, [2 x i8] }
+
+declare i32 @__f1(i8*, %struct.buffer_t* noalias)
+
+; CHECK-LABEL: f1:
+define i32 @f1(i8* %__user_context, %struct.buffer_t* noalias %f1.buffer) {
+entry:
+ br i1 undef, label %"assert succeeded", label %"assert failed", !prof !1
+
+"assert failed": ; preds = %entry
+ br label %destructor_block
+
+"assert succeeded": ; preds = %entry
+ %__f1_result = call i32 @__f1(i8* %__user_context, %struct.buffer_t* %f1.buffer) #5
+ %0 = icmp eq i32 %__f1_result, 0
+ br i1 %0, label %"assert succeeded11", label %"assert failed10", !prof !1
+
+destructor_block: ; preds = %"assert succeeded11", %"assert failed10", %"assert failed"
+ %1 = phi i32 [ undef, %"assert failed" ], [ %__f1_result, %"assert failed10" ], [ 0, %"assert succeeded11" ]
+ ret i32 %1
+
+"assert failed10": ; preds = %"assert succeeded"
+ br label %destructor_block
+
+"assert succeeded11": ; preds = %"assert succeeded"
+ br label %destructor_block
+}
+
+attributes #5 = { nounwind }
+
+!1 = !{!"branch_weights", i32 1073741824, i32 0}
diff --git a/test/CodeGen/PowerPC/ppc64le-localentry-large.ll b/test/CodeGen/PowerPC/ppc64le-localentry-large.ll
new file mode 100644
index 0000000000000..4cf1b5a233d52
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-localentry-large.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=ppc64le -code-model=large < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@number64 = global i64 10, align 8
+
+; CHECK: .abiversion 2
+
+define i64 @use_toc(i64 %a) nounwind {
+entry:
+; CHECK: .Lfunc_toc[[FN:[0-9]+]]:
+; CHECK-NEXT: .quad .TOC.-.Lfunc_gep[[FN]]
+; CHECK: use_toc:
+; CHECK-NEXT: .L{{.*}}:
+; CHECK-NEXT: .Lfunc_gep[[FN]]:
+; CHECK-NEXT: ld 2, .Lfunc_toc[[FN]]-.Lfunc_gep[[FN]](12)
+; CHECK-NEXT: add 2, 2, 12
+; CHECK-NEXT: .Lfunc_lep[[FN]]:
+; CHECK-NEXT: .localentry use_toc, .Lfunc_lep[[FN]]-.Lfunc_gep[[FN]]
+; CHECK-NEXT: %entry
+ %0 = load i64, i64* @number64, align 8
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
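+
+; Informally: ELFv2 gives each function two entry points. The global entry
+; (.Lfunc_gep) derives the TOC pointer from r12, which -code-model=large does
+; via the .quad offset loaded above; the local entry (.Lfunc_lep) assumes r2
+; is already valid, and .localentry records the distance between the two.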
+
diff --git a/test/CodeGen/PowerPC/ppc64le-localentry.ll b/test/CodeGen/PowerPC/ppc64le-localentry.ll
index be64f1151769d..45cae6b17bc22 100644
--- a/test/CodeGen/PowerPC/ppc64le-localentry.ll
+++ b/test/CodeGen/PowerPC/ppc64le-localentry.ll
@@ -17,11 +17,11 @@ define i64 @use_toc(i64 %a) nounwind {
entry:
; CHECK-LABEL: @use_toc
; CHECK-NEXT: .L{{.*}}:
-; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]:
-; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha
-; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l
-; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]:
-; CHECK-NEXT: .localentry use_toc, .Ltmp[[TMP2]]-.Ltmp[[TMP1]]
+; CHECK-NEXT: .Lfunc_gep[[FN:[0-9]+]]:
+; CHECK-NEXT: addis 2, 12, .TOC.-.Lfunc_gep[[FN]]@ha
+; CHECK-NEXT: addi 2, 2, .TOC.-.Lfunc_gep[[FN]]@l
+; CHECK-NEXT: .Lfunc_lep[[FN]]:
+; CHECK-NEXT: .localentry use_toc, .Lfunc_lep[[FN]]-.Lfunc_gep[[FN]]
; CHECK-NEXT: %entry
%0 = load i64, i64* @number64, align 8
%cmp = icmp eq i64 %0, %a
@@ -34,11 +34,11 @@ define void @use_toc_implicit() nounwind {
entry:
; CHECK-LABEL: @use_toc_implicit
; CHECK-NEXT: .L{{.*}}:
-; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]:
-; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha
-; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l
-; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]:
-; CHECK-NEXT: .localentry use_toc_implicit, .Ltmp[[TMP2]]-.Ltmp[[TMP1]]
+; CHECK-NEXT: .Lfunc_gep[[FN:[0-9]+]]:
+; CHECK-NEXT: addis 2, 12, .TOC.-.Lfunc_gep[[FN]]@ha
+; CHECK-NEXT: addi 2, 2, .TOC.-.Lfunc_gep[[FN]]@l
+; CHECK-NEXT: .Lfunc_lep[[FN]]:
+; CHECK-NEXT: .localentry use_toc_implicit, .Lfunc_lep[[FN]]-.Lfunc_gep[[FN]]
; CHECK-NEXT: %entry
call void @callee()
ret void
diff --git a/test/CodeGen/PowerPC/pr25802.ll b/test/CodeGen/PowerPC/pr25802.ll
new file mode 100644
index 0000000000000..0631850be5fa6
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr25802.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s | FileCheck %s
+; CHECK: .long .Ltmp6-.Ltmp12 # Call between .Ltmp12 and .Ltmp6
+
+; We used to crash in filetype=obj when computing a negative value.
+; RUN: llc -filetype=obj < %s
+
+target triple = "powerpc--netbsd"
+@_ZTI1I = external constant { i8*, i8* }
+define void @f(i8 %foo, i32 %bar) personality i8* bitcast (void ()* @g to i8*) {
+ invoke void @g()
+ to label %try.cont unwind label %lpad
+lpad: ; preds = %0
+ %tmp = landingpad { i8*, i32 }
+ catch i8* bitcast ({ i8*, i8* }* @_ZTI1I to i8*)
+ br i1 undef, label %catch10, label %catch
+catch10: ; preds = %lpad
+ %tmp8 = load i32, i32* undef, align 4
+ %conv.i.i = zext i8 %foo to i32
+ %cond.i.i = select i1 undef, i32 %conv.i.i, i32 %tmp8
+ invoke void @_Z24__put_character_sequenceIccEvR1AIT_T0_Ej(i32 %cond.i.i)
+ to label %invoke.cont20 unwind label %lpad15
+invoke.cont20: ; preds = %catch10
+ ret void
+try.cont: ; preds = %0
+ ret void
+catch: ; preds = %lpad
+ %tmp14 = load i32, i32* undef, align 4
+ %conv.i.i34 = zext i8 %foo to i32
+ %cond.i.i35 = select i1 undef, i32 %conv.i.i34, i32 %tmp14
+ invoke void @_Z24__put_character_sequenceIccEvR1AIT_T0_Ej(i32 %cond.i.i35)
+ to label %invoke.cont8 unwind label %lpad3
+invoke.cont8: ; preds = %catch
+ ret void
+lpad3: ; preds = %catch
+ %tmp16 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @g()
+ to label %eh.resume unwind label %terminate.lpad
+lpad15: ; preds = %catch10
+ %tmp19 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @g()
+ to label %eh.resume unwind label %terminate.lpad
+eh.resume: ; preds = %lpad15, %lpad3
+ ret void
+terminate.lpad: ; preds = %lpad15, %lpad3
+ %tmp22 = landingpad { i8*, i32 }
+ catch i8* null
+ ret void
+}
+declare void @g()
+declare void @_Z24__put_character_sequenceIccEvR1AIT_T0_Ej(i32)
diff --git a/test/CodeGen/PowerPC/tls_get_addr_clobbers.ll b/test/CodeGen/PowerPC/tls_get_addr_clobbers.ll
new file mode 100644
index 0000000000000..400a1f297f004
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls_get_addr_clobbers.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple="powerpc64le-unknown-linux-gnu" -relocation-model=pic < %s | FileCheck %s
+
+@a = thread_local global i32* null, align 8
+
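+; __tls_get_addr clobbers the volatile registers, so the eight pointer
+; arguments arriving in r3-r10 have to be saved (copied and spilled) before
+; the call and must still be available for the stores after it, as the
+; CHECK-DAG lines below verify.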
+define void @test_foo(i32* nocapture %x01, i32* nocapture %x02, i32* nocapture %x03, i32* nocapture %x04, i32* nocapture %x05, i32* nocapture %x06, i32* nocapture %x07, i32* nocapture %x08) #0 {
+entry:
+
+; CHECK-LABEL: test_foo:
+; CHECK: stdu 1, {{-?[0-9]+}}(1)
+; CHECK-DAG: mr [[BACKUP_3:[0-9]+]], 3
+; CHECK-DAG: mr [[BACKUP_4:[0-9]+]], 4
+; CHECK-DAG: mr [[BACKUP_5:[0-9]+]], 5
+; CHECK-DAG: mr [[BACKUP_6:[0-9]+]], 6
+; CHECK-DAG: mr [[BACKUP_7:[0-9]+]], 7
+; CHECK-DAG: mr [[BACKUP_8:[0-9]+]], 8
+; CHECK-DAG: mr [[BACKUP_9:[0-9]+]], 9
+; CHECK-DAG: mr [[BACKUP_10:[0-9]+]], 10
+; CHECK-DAG: std [[BACKUP_3]], {{[0-9]+}}(1)
+; CHECK-DAG: std [[BACKUP_4]], {{[0-9]+}}(1)
+; CHECK-DAG: std [[BACKUP_5]], {{[0-9]+}}(1)
+; CHECK-DAG: std [[BACKUP_6]], {{[0-9]+}}(1)
+; CHECK-DAG: std [[BACKUP_7]], {{[0-9]+}}(1)
+; CHECK-DAG: std [[BACKUP_8]], {{[0-9]+}}(1)
+; CHECK-DAG: std [[BACKUP_9]], {{[0-9]+}}(1)
+; CHECK-DAG: std [[BACKUP_10]], {{[0-9]+}}(1)
+; CHECK: bl __tls_get_addr
+; CHECK-DAG: stw 3, 0([[BACKUP_3]])
+; CHECK-DAG: stw 3, 0([[BACKUP_4]])
+; CHECK-DAG: stw 3, 0([[BACKUP_5]])
+; CHECK-DAG: stw 3, 0([[BACKUP_6]])
+; CHECK-DAG: stw 3, 0([[BACKUP_7]])
+; CHECK-DAG: stw 3, 0([[BACKUP_8]])
+; CHECK-DAG: stw 3, 0([[BACKUP_9]])
+; CHECK-DAG: stw 3, 0([[BACKUP_10]])
+; CHECK: blr
+
+ %0 = load i32*, i32** @a, align 8
+ %cmp = icmp eq i32* %0, null
+ br i1 %cmp, label %return, label %if.end
+
+if.end: ; preds = %entry
+ store i32 0, i32* %x01, align 4
+ store i32 0, i32* %x02, align 4
+ store i32 0, i32* %x03, align 4
+ store i32 0, i32* %x04, align 4
+ store i32 0, i32* %x05, align 4
+ store i32 0, i32* %x06, align 4
+ store i32 0, i32* %x07, align 4
+ store i32 0, i32* %x08, align 4
+ br label %return
+
+return: ; preds = %entry, %if.end
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/tls_get_addr_stackframe.ll b/test/CodeGen/PowerPC/tls_get_addr_stackframe.ll
new file mode 100644
index 0000000000000..4a235983e6f7b
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls_get_addr_stackframe.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple="powerpc64le-unknown-linux-gnu" -relocation-model=pic < %s | FileCheck %s
+; CHECK-LABEL: foo_test:
+; CHECK: mflr 0
+; CHECK: __tls_get_addr
+
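+; The check order above ensures the prologue sets up the stack frame
+; (mflr 0) before the call to __tls_get_addr is emitted.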
+%struct1.2.41 = type { %struct2.0.39, %struct3.1.40, %struct1.2.41* }
+%struct2.0.39 = type { i64, i32, i32, i32, i32 }
+%struct3.1.40 = type { [160 x i8] }
+
+@tls_var = external thread_local global %struct1.2.41*, align 8
+
+define void @foo_test() {
+ %1 = load %struct1.2.41*, %struct1.2.41** @tls_var, align 8
+ br i1 undef, label %foo.exit, label %2
+
+; <label>:2 ; preds = %0
+ br i1 undef, label %foo.exit, label %3
+
+; <label>:3 ; preds = %2
+ %4 = getelementptr inbounds %struct1.2.41, %struct1.2.41* %1, i64 0, i32 0, i32 3
+ %5 = load i32, i32* %4, align 8
+ %6 = add nsw i32 %5, -1
+ %7 = icmp eq i32 %6, 0
+ br i1 %7, label %8, label %foo.exit
+
+; <label>:8 ; preds = %3
+ tail call void undef(%struct1.2.41* undef, %struct1.2.41* nonnull undef)
+ br label %foo.exit
+
+foo.exit: ; preds = %8, %3, %2, %0
+ ret void
+}
diff --git a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
index 29bca67e2d24d..b38c955e1f63c 100644
--- a/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
+++ b/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
@@ -59,7 +59,7 @@ entry:
;CHECK: sethi
;CHECK: !NO_APP
;CHECK-NEXT: cmp
-;CHECK-NEXT: bg
+;CHECK-NEXT: ble
;CHECK-NEXT: mov
tail call void asm sideeffect "sethi 0, %g0", ""() nounwind
%0 = icmp slt i32 %a, 0
diff --git a/test/CodeGen/SPARC/analyze-branch.ll b/test/CodeGen/SPARC/analyze-branch.ll
new file mode 100644
index 0000000000000..7d2096033a03c
--- /dev/null
+++ b/test/CodeGen/SPARC/analyze-branch.ll
@@ -0,0 +1,58 @@
+; RUN: llc -mtriple=sparc-none-linux-gnu < %s | FileCheck %s
+
+; This test checks that LLVM can do basic stripping and reapplying of branches
+; to basic blocks.
+
+declare void @test_true()
+declare void @test_false()
+
+; !0 corresponds to a branch being taken, !1 to it not being taken.
+!0 = !{!"branch_weights", i32 64, i32 4}
+!1 = !{!"branch_weights", i32 4, i32 64}
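+; In both functions below, the cold successor is branched to and the hot
+; successor becomes the fallthrough, so the branch condition is inverted
+; (bne for !0, be for !1).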
+
+define void @test_Bcc_fallthrough_taken(i32 %in) nounwind {
+; CHECK-LABEL: test_Bcc_fallthrough_taken:
+ %tst = icmp eq i32 %in, 42
+ br i1 %tst, label %true, label %false, !prof !0
+
+; CHECK: cmp {{%[goli][0-9]+}}, 42
+; CHECK: bne [[FALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: nop
+; CHECK-NEXT: ! BB#
+; CHECK-NEXT: call test_true
+
+; CHECK: [[FALSE]]:
+; CHECK: call test_false
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_Bcc_fallthrough_nottaken(i32 %in) nounwind {
+; CHECK-LABEL: test_Bcc_fallthrough_nottaken:
+ %tst = icmp eq i32 %in, 42
+ br i1 %tst, label %true, label %false, !prof !1
+
+; CHECK: cmp {{%[goli][0-9]+}}, 42
+
+; CHECK: be [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: nop
+; CHECK-NEXT: ! BB#
+; CHECK-NEXT: call test_false
+
+; CHECK: [[TRUE]]:
+; CHECK: call test_true
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/call.ll b/test/CodeGen/WebAssembly/call.ll
index 9158ccec09798..6d5542c89d3d4 100644
--- a/test/CodeGen/WebAssembly/call.ll
+++ b/test/CodeGen/WebAssembly/call.ll
@@ -2,7 +2,7 @@
; Test that basic call operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare i32 @i32_nullary()
@@ -15,7 +15,7 @@ declare void @void_nullary()
; CHECK-LABEL: call_i32_nullary:
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_nullary{{$}}
+; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_nullary@FUNCTION{{$}}
; CHECK-NEXT: return $pop[[NUM]]{{$}}
define i32 @call_i32_nullary() {
%r = call i32 @i32_nullary()
@@ -24,7 +24,7 @@ define i32 @call_i32_nullary() {
; CHECK-LABEL: call_i64_nullary:
; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: {{^}} i64.call $push[[NUM:[0-9]+]]=, i64_nullary{{$}}
+; CHECK-NEXT: {{^}} i64.call $push[[NUM:[0-9]+]]=, i64_nullary@FUNCTION{{$}}
; CHECK-NEXT: return $pop[[NUM]]{{$}}
define i64 @call_i64_nullary() {
%r = call i64 @i64_nullary()
@@ -33,7 +33,7 @@ define i64 @call_i64_nullary() {
; CHECK-LABEL: call_float_nullary:
; CHECK-NEXT: .result f32{{$}}
-; CHECK-NEXT: {{^}} f32.call $push[[NUM:[0-9]+]]=, float_nullary{{$}}
+; CHECK-NEXT: {{^}} f32.call $push[[NUM:[0-9]+]]=, float_nullary@FUNCTION{{$}}
; CHECK-NEXT: return $pop[[NUM]]{{$}}
define float @call_float_nullary() {
%r = call float @float_nullary()
@@ -42,7 +42,7 @@ define float @call_float_nullary() {
; CHECK-LABEL: call_double_nullary:
; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: {{^}} f64.call $push[[NUM:[0-9]+]]=, double_nullary{{$}}
+; CHECK-NEXT: {{^}} f64.call $push[[NUM:[0-9]+]]=, double_nullary@FUNCTION{{$}}
; CHECK-NEXT: return $pop[[NUM]]{{$}}
define double @call_double_nullary() {
%r = call double @double_nullary()
@@ -50,7 +50,7 @@ define double @call_double_nullary() {
}
; CHECK-LABEL: call_void_nullary:
-; CHECK-NEXT: {{^}} call void_nullary{{$}}
+; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
; CHECK-NEXT: return{{$}}
define void @call_void_nullary() {
call void @void_nullary()
@@ -60,7 +60,7 @@ define void @call_void_nullary() {
; CHECK-LABEL: call_i32_unary:
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_unary, $0{{$}}
+; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_unary@FUNCTION, $0{{$}}
; CHECK-NEXT: return $pop[[NUM]]{{$}}
define i32 @call_i32_unary(i32 %a) {
%r = call i32 @i32_unary(i32 %a)
@@ -70,7 +70,7 @@ define i32 @call_i32_unary(i32 %a) {
; CHECK-LABEL: call_i32_binary:
; CHECK-NEXT: .param i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_binary, $0, $1{{$}}
+; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_binary@FUNCTION, $0, $1{{$}}
; CHECK-NEXT: return $pop[[NUM]]{{$}}
define i32 @call_i32_binary(i32 %a, i32 %b) {
%r = call i32 @i32_binary(i32 %a, i32 %b)
@@ -97,7 +97,7 @@ define i32 @call_indirect_i32(i32 ()* %callee) {
}
; CHECK-LABEL: tail_call_void_nullary:
-; CHECK-NEXT: {{^}} call void_nullary{{$}}
+; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
; CHECK-NEXT: return{{$}}
define void @tail_call_void_nullary() {
tail call void @void_nullary()
@@ -105,7 +105,7 @@ define void @tail_call_void_nullary() {
}
; CHECK-LABEL: fastcc_tail_call_void_nullary:
-; CHECK-NEXT: {{^}} call void_nullary{{$}}
+; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
; CHECK-NEXT: return{{$}}
define void @fastcc_tail_call_void_nullary() {
tail call fastcc void @void_nullary()
@@ -113,7 +113,7 @@ define void @fastcc_tail_call_void_nullary() {
}
; CHECK-LABEL: coldcc_tail_call_void_nullary:
-; CHECK-NEXT: {{^}} call void_nullary
+; CHECK-NEXT: {{^}} call void_nullary@FUNCTION{{$}}
; CHECK-NEXT: return{{$}}
define void @coldcc_tail_call_void_nullary() {
tail call coldcc void @void_nullary()
diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll
index 71f3551347bf5..f0e5f44716785 100644
--- a/test/CodeGen/WebAssembly/cfg-stackify.ll
+++ b/test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -3,7 +3,7 @@
; Test the CFG stackifier pass.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare void @something()
@@ -18,7 +18,7 @@ declare void @something()
; CHECK-NEXT: br_if
; CHECK-NOT: br
; CHECK: call
-; CHECK: br BB0_1{{$}}
+; CHECK: br 0{{$}}
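+; (The br/br_if operand is the relative nesting depth of the target
+; block or loop construct, not a label name.)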
; CHECK: return{{$}}
; OPT-LABEL: test0:
; OPT: loop
@@ -28,7 +28,7 @@ declare void @something()
; OPT-NEXT: br_if
; OPT-NOT: br
; OPT: call
-; OPT: br BB0_1{{$}}
+; OPT: br 0{{$}}
; OPT: return{{$}}
define void @test0(i32 %n) {
entry:
@@ -59,7 +59,7 @@ back:
; CHECK-NEXT: br_if
; CHECK-NOT: br
; CHECK: call
-; CHECK: br BB1_1{{$}}
+; CHECK: br 0{{$}}
; CHECK: return{{$}}
; OPT-LABEL: test1:
; OPT: loop
@@ -69,7 +69,7 @@ back:
; OPT-NEXT: br_if
; OPT-NOT: br
; OPT: call
-; OPT: br BB1_1{{$}}
+; OPT: br 0{{$}}
; OPT: return{{$}}
define void @test1(i32 %n) {
entry:
@@ -93,18 +93,20 @@ back:
; Test that a simple loop is handled as expected.
; CHECK-LABEL: test2:
-; CHECK: block BB2_2{{$}}
-; CHECK: br_if {{[^,]*}}, BB2_2{{$}}
-; CHECK: BB2_1:
-; CHECK: br_if ${{[0-9]+}}, BB2_1{{$}}
-; CHECK: BB2_2:
+; CHECK-NOT: local
+; CHECK: block{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: .LBB2_1:
+; CHECK: br_if ${{[0-9]+}}, 0{{$}}
+; CHECK: .LBB2_2:
; CHECK: return{{$}}
; OPT-LABEL: test2:
-; OPT: block BB2_2{{$}}
-; OPT: br_if {{[^,]*}}, BB2_2{{$}}
-; OPT: BB2_1:
-; OPT: br_if ${{[0-9]+}}, BB2_1{{$}}
-; OPT: BB2_2:
+; OPT-NOT: local
+; OPT: block{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: .LBB2_1:
+; OPT: br_if ${{[0-9]+}}, 0{{$}}
+; OPT: .LBB2_2:
; OPT: return{{$}}
define void @test2(double* nocapture %p, i32 %n) {
entry:
@@ -132,24 +134,29 @@ for.end:
}
; CHECK-LABEL: doublediamond:
-; CHECK: block BB3_5{{$}}
-; CHECK: block BB3_2{{$}}
-; CHECK: br_if $0, BB3_2{{$}}
-; CHECK: block BB3_4{{$}}
-; CHECK: br_if $1, BB3_4{{$}}
-; CHECK: br BB3_5{{$}}
-; CHECK: BB3_4:
-; CHECK: BB3_5:
+; CHECK: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK: br_if ${{[^,]*}}, 0{{$}}
+; CHECK: br 1{{$}}
+; CHECK: .LBB3_2:
+; CHECK-NEXT: end_block{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if ${{[^,]*}}, 0{{$}}
+; CHECK: br 1{{$}}
+; CHECK: .LBB3_4:
+; CHECK-NEXT: end_block{{$}}
+; CHECK: .LBB3_5:
+; CHECK-NEXT: end_block{{$}}
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: doublediamond:
-; OPT: block BB3_5{{$}}
-; OPT: block BB3_4{{$}}
-; OPT: br_if {{[^,]*}}, BB3_4{{$}}
-; OPT: block BB3_3{{$}}
-; OPT: br_if {{[^,]*}}, BB3_3{{$}}
-; OPT: br BB3_5{{$}}
-; OPT: BB3_4:
-; OPT: BB3_5:
+; OPT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT: br_if ${{[^,]*}}, 0{{$}}
+; OPT: block{{$}}
+; OPT: br_if ${{[^,]*}}, 0{{$}}
+; OPT: br 1{{$}}
+; OPT: .LBB3_4:
+; OPT: .LBB3_5:
; OPT: return ${{[0-9]+}}{{$}}
define i32 @doublediamond(i32 %a, i32 %b, i32* %p) {
entry:
@@ -175,14 +182,14 @@ exit:
}
; CHECK-LABEL: triangle:
-; CHECK: block BB4_2{{$}}
-; CHECK: br_if $1, BB4_2{{$}}
-; CHECK: BB4_2:
+; CHECK: block{{$}}
+; CHECK: br_if $1, 0{{$}}
+; CHECK: .LBB4_2:
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: triangle:
-; OPT: block BB4_2{{$}}
-; OPT: br_if $1, BB4_2{{$}}
-; OPT: BB4_2:
+; OPT: block{{$}}
+; OPT: br_if $1, 0{{$}}
+; OPT: .LBB4_2:
; OPT: return ${{[0-9]+}}{{$}}
define i32 @triangle(i32* %p, i32 %a) {
entry:
@@ -198,20 +205,20 @@ exit:
}
; CHECK-LABEL: diamond:
-; CHECK: block BB5_3{{$}}
-; CHECK: block BB5_2{{$}}
-; CHECK: br_if $1, BB5_2{{$}}
-; CHECK: br BB5_3{{$}}
-; CHECK: BB5_2:
-; CHECK: BB5_3:
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if $1, 0{{$}}
+; CHECK: br 1{{$}}
+; CHECK: .LBB5_2:
+; CHECK: .LBB5_3:
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: diamond:
-; OPT: block BB5_3{{$}}
-; OPT: block BB5_2{{$}}
-; OPT: br_if {{[^,]*}}, BB5_2{{$}}
-; OPT: br BB5_3{{$}}
-; OPT: BB5_2:
-; OPT: BB5_3:
+; OPT: block{{$}}
+; OPT: block{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br 1{{$}}
+; OPT: .LBB5_2:
+; OPT: .LBB5_3:
; OPT: return ${{[0-9]+}}{{$}}
define i32 @diamond(i32* %p, i32 %a) {
entry:
@@ -243,16 +250,16 @@ entry:
; CHECK-LABEL: minimal_loop:
; CHECK-NOT: br
-; CHECK: BB7_1:
+; CHECK: .LBB7_1:
; CHECK: i32.store $discard=, 0($0), $pop{{[0-9]+}}{{$}}
-; CHECK: br BB7_1{{$}}
-; CHECK: BB7_2:
+; CHECK: br 0{{$}}
+; CHECK: .LBB7_2:
; OPT-LABEL: minimal_loop:
; OPT-NOT: br
-; OPT: BB7_1:
+; OPT: .LBB7_1:
; OPT: i32.store $discard=, 0($0), $pop{{[0-9]+}}{{$}}
-; OPT: br BB7_1{{$}}
-; OPT: BB7_2:
+; OPT: br 0{{$}}
+; OPT: .LBB7_2:
define i32 @minimal_loop(i32* %p) {
entry:
store volatile i32 0, i32* %p
@@ -264,17 +271,17 @@ loop:
; CHECK-LABEL: simple_loop:
; CHECK-NOT: br
-; CHECK: BB8_1:
-; CHECK: loop BB8_2{{$}}
-; CHECK: br_if $pop{{[0-9]+}}, BB8_1{{$}}
-; CHECK: BB8_2:
+; CHECK: .LBB8_1:
+; CHECK: loop{{$}}
+; CHECK: br_if $pop{{[0-9]+}}, 0{{$}}
+; CHECK-NEXT: end_loop{{$}}
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: simple_loop:
; OPT-NOT: br
-; OPT: BB8_1:
-; OPT: loop BB8_2{{$}}
-; OPT: br_if {{[^,]*}}, BB8_1{{$}}
-; OPT: BB8_2:
+; OPT: .LBB8_1:
+; OPT: loop{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT-NEXT: end_loop{{$}}
; OPT: return ${{[0-9]+}}{{$}}
define i32 @simple_loop(i32* %p, i32 %a) {
entry:
@@ -290,20 +297,20 @@ exit:
}
; CHECK-LABEL: doubletriangle:
-; CHECK: block BB9_4{{$}}
-; CHECK: br_if $0, BB9_4{{$}}
-; CHECK: block BB9_3{{$}}
-; CHECK: br_if $1, BB9_3{{$}}
-; CHECK: BB9_3:
-; CHECK: BB9_4:
+; CHECK: block{{$}}
+; CHECK: br_if $0, 0{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if $1, 0{{$}}
+; CHECK: .LBB9_3:
+; CHECK: .LBB9_4:
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: doubletriangle:
-; OPT: block BB9_4{{$}}
-; OPT: br_if $0, BB9_4{{$}}
-; OPT: block BB9_3{{$}}
-; OPT: br_if $1, BB9_3{{$}}
-; OPT: BB9_3:
-; OPT: BB9_4:
+; OPT: block{{$}}
+; OPT: br_if $0, 0{{$}}
+; OPT: block{{$}}
+; OPT: br_if $1, 0{{$}}
+; OPT: .LBB9_3:
+; OPT: .LBB9_4:
; OPT: return ${{[0-9]+}}{{$}}
define i32 @doubletriangle(i32 %a, i32 %b, i32* %p) {
entry:
@@ -326,22 +333,22 @@ exit:
}
; CHECK-LABEL: ifelse_earlyexits:
-; CHECK: block BB10_4{{$}}
-; CHECK: block BB10_2{{$}}
-; CHECK: br_if $0, BB10_2{{$}}
-; CHECK: br BB10_4{{$}}
-; CHECK: BB10_2:
-; CHECK: br_if $1, BB10_4{{$}}
-; CHECK: BB10_4:
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if $0, 0{{$}}
+; CHECK: br 1{{$}}
+; CHECK: .LBB10_2:
+; CHECK: br_if $1, 0{{$}}
+; CHECK: .LBB10_4:
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: ifelse_earlyexits:
-; OPT: block BB10_4{{$}}
-; OPT: block BB10_3{{$}}
-; OPT: br_if {{[^,]*}}, BB10_3{{$}}
-; OPT: br_if $1, BB10_4{{$}}
-; OPT: br BB10_4{{$}}
-; OPT: BB10_3:
-; OPT: BB10_4:
+; OPT: block{{$}}
+; OPT: block{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if $1, 1{{$}}
+; OPT: br 1{{$}}
+; OPT: .LBB10_3:
+; OPT: .LBB10_4:
; OPT: return ${{[0-9]+}}{{$}}
define i32 @ifelse_earlyexits(i32 %a, i32 %b, i32* %p) {
entry:
@@ -364,35 +371,40 @@ exit:
}
; CHECK-LABEL: doublediamond_in_a_loop:
-; CHECK: BB11_1:
-; CHECK: loop BB11_7{{$}}
-; CHECK: block BB11_6{{$}}
-; CHECK: block BB11_3{{$}}
-; CHECK: br_if $0, BB11_3{{$}}
-; CHECK: br BB11_6{{$}}
-; CHECK: BB11_3:
-; CHECK: block BB11_5{{$}}
-; CHECK: br_if $1, BB11_5{{$}}
-; CHECK: br BB11_6{{$}}
-; CHECK: BB11_5:
-; CHECK: BB11_6:
-; CHECK: br BB11_1{{$}}
-; CHECK: BB11_7:
+; CHECK: .LBB11_1:
+; CHECK: loop{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if $0, 0{{$}}
+; CHECK: br 1{{$}}
+; CHECK: .LBB11_3:
+; CHECK: block{{$}}
+; CHECK: br_if $1, 0{{$}}
+; CHECK: br 1{{$}}
+; CHECK: .LBB11_5:
+; CHECK: .LBB11_6:
+; CHECK: br 0{{$}}
+; CHECK: .LBB11_7:
+; CHECK-NEXT: end_loop{{$}}
; OPT-LABEL: doublediamond_in_a_loop:
-; OPT: BB11_1:
-; OPT: loop BB11_7{{$}}
-; OPT: block BB11_6{{$}}
-; OPT: block BB11_5{{$}}
-; OPT: br_if {{[^,]*}}, BB11_5{{$}}
-; OPT: block BB11_4{{$}}
-; OPT: br_if {{[^,]*}}, BB11_4{{$}}
-; OPT: br BB11_6{{$}}
-; OPT: BB11_4:
-; OPT: br BB11_6{{$}}
-; OPT: BB11_5:
-; OPT: BB11_6:
-; OPT: br BB11_1{{$}}
-; OPT: BB11_7:
+; OPT: .LBB11_1:
+; OPT: loop{{$}}
+; OPT: block{{$}}
+; OPT: block{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: block{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br 2{{$}}
+; OPT: .LBB11_4:
+; OPT-NEXT: end_block{{$}}
+; OPT: br 1{{$}}
+; OPT: .LBB11_5:
+; OPT-NEXT: end_block{{$}}
+; OPT: .LBB11_6:
+; OPT-NEXT: end_block{{$}}
+; OPT: br 0{{$}}
+; OPT: .LBB11_7:
+; OPT-NEXT: end_loop{{$}}
define i32 @doublediamond_in_a_loop(i32 %a, i32 %b, i32* %p) {
entry:
br label %header
@@ -423,12 +435,12 @@ exit:
; CHECK-LABEL: test3:
; CHECK: loop
; CHECK-NEXT: br_if
-; CHECK-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK-NEXT: .LBB{{[0-9]+}}_{{[0-9]+}}:
; CHECK-NEXT: loop
; OPT-LABEL: test3:
; OPT: loop
; OPT-NEXT: br_if
-; OPT-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
+; OPT-NEXT: .LBB{{[0-9]+}}_{{[0-9]+}}:
; OPT-NEXT: loop
declare void @bar()
define void @test3(i32 %w) {
@@ -460,42 +472,48 @@ if.end:
; Test switch lowering and block placement.
; CHECK-LABEL: test4:
-; CHECK-NEXT: .param i32{{$}}
-; CHECK: block BB13_8{{$}}
-; CHECK-NEXT: block BB13_7{{$}}
-; CHECK-NEXT: block BB13_4{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, BB13_4{{$}}
-; CHECK-NEXT: block BB13_3{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, BB13_3{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, BB13_7{{$}}
-; CHECK-NEXT: BB13_3:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK: br_if $pop{{[0-9]*}}, 0{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK: br_if $pop{{[0-9]*}}, 0{{$}}
+; CHECK: br_if $pop{{[0-9]*}}, 2{{$}}
+; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: BB13_4:
-; CHECK: br_if $pop{{[0-9]*}}, BB13_8{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, BB13_7{{$}}
+; CHECK-NEXT: .LBB13_4:
+; CHECK: br_if $pop{{[0-9]*}}, 1{{$}}
+; CHECK: br_if $pop{{[0-9]*}}, 0{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: BB13_7:
+; CHECK-NEXT: .LBB13_7:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: BB13_8:
+; CHECK-NEXT: .LBB13_8:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
; OPT-LABEL: test4:
-; OPT-NEXT: .param i32{{$}}
-; OPT: block BB13_8{{$}}
-; OPT-NEXT: block BB13_7{{$}}
-; OPT-NEXT: block BB13_4{{$}}
-; OPT: br_if $pop{{[0-9]*}}, BB13_4{{$}}
-; OPT-NEXT: block BB13_3{{$}}
-; OPT: br_if $pop{{[0-9]*}}, BB13_3{{$}}
-; OPT: br_if $pop{{[0-9]*}}, BB13_7{{$}}
-; OPT-NEXT: BB13_3:
+; OPT-NEXT: .param i32{{$}}
+; OPT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT: br_if $pop{{[0-9]*}}, 0{{$}}
+; OPT-NEXT: block{{$}}
+; OPT: br_if $pop{{[0-9]*}}, 0{{$}}
+; OPT: br_if $pop{{[0-9]*}}, 2{{$}}
+; OPT-NEXT: .LBB13_3:
+; OPT-NEXT: end_block{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: BB13_4:
-; OPT: br_if $pop{{[0-9]*}}, BB13_8{{$}}
-; OPT: br_if $pop{{[0-9]*}}, BB13_7{{$}}
+; OPT-NEXT: .LBB13_4:
+; OPT: br_if $pop{{[0-9]*}}, 1{{$}}
+; OPT: br_if $pop{{[0-9]*}}, 0{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: BB13_7:
+; OPT-NEXT: .LBB13_7:
+; OPT-NEXT: end_block{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: BB13_8:
+; OPT-NEXT: .LBB13_8:
+; OPT-NEXT: end_block{{$}}
; OPT-NEXT: return{{$}}
define void @test4(i32 %t) {
entry:
@@ -523,24 +541,24 @@ default:
; same basic block.
; CHECK-LABEL: test5:
-; CHECK: BB14_1:
-; CHECK-NEXT: block BB14_4{{$}}
-; CHECK-NEXT: loop BB14_3{{$}}
-; CHECK: br_if {{[^,]*}}, BB14_4{{$}}
-; CHECK: br_if {{[^,]*}}, BB14_1{{$}}
-; CHECK-NEXT: BB14_3:
+; CHECK: .LBB14_1:
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: loop{{$}}
+; CHECK: br_if {{[^,]*}}, 2{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK-NEXT: end_loop{{$}}
; CHECK: return{{$}}
-; CHECK-NEXT: BB14_4:
+; CHECK-NEXT: .LBB14_4:
; CHECK: return{{$}}
; OPT-LABEL: test5:
-; OPT: BB14_1:
-; OPT-NEXT: block BB14_4{{$}}
-; OPT-NEXT: loop BB14_3{{$}}
-; OPT: br_if {{[^,]*}}, BB14_4{{$}}
-; OPT: br_if {{[^,]*}}, BB14_1{{$}}
-; OPT-NEXT: BB14_3:
+; OPT: .LBB14_1:
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: loop{{$}}
+; OPT: br_if {{[^,]*}}, 2{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT-NEXT: end_loop{{$}}
; OPT: return{{$}}
-; OPT-NEXT: BB14_4:
+; OPT-NEXT: .LBB14_4:
; OPT: return{{$}}
define void @test5(i1 %p, i1 %q) {
entry:
@@ -568,41 +586,45 @@ return:
; which has another predecessor.
; CHECK-LABEL: test6:
-; CHECK: BB15_1:
-; CHECK-NEXT: block BB15_6{{$}}
-; CHECK-NEXT: block BB15_5{{$}}
-; CHECK-NEXT: loop BB15_4{{$}}
+; CHECK: .LBB15_1:
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB15_6{{$}}
+; CHECK: br_if {{[^,]*}}, 3{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB15_5{{$}}
+; CHECK: br_if {{[^,]*}}, 2{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB15_1{{$}}
-; CHECK-NEXT: BB15_4:
+; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK-NEXT: end_loop{{$}}
; CHECK-NOT: block
; CHECK: return{{$}}
-; CHECK-NEXT: BB15_5:
+; CHECK-NEXT: .LBB15_5:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: BB15_6:
+; CHECK: .LBB15_6:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
; CHECK: return{{$}}
; OPT-LABEL: test6:
-; OPT: BB15_1:
-; OPT-NEXT: block BB15_6{{$}}
-; OPT-NEXT: block BB15_5{{$}}
-; OPT-NEXT: loop BB15_4{{$}}
+; OPT: .LBB15_1:
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB15_6{{$}}
+; OPT: br_if {{[^,]*}}, 3{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB15_5{{$}}
+; OPT: br_if {{[^,]*}}, 2{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB15_1{{$}}
-; OPT-NEXT: BB15_4:
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT-NEXT: end_loop{{$}}
; OPT-NOT: block
; OPT: return{{$}}
-; OPT-NEXT: BB15_5:
+; OPT-NEXT: .LBB15_5:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: BB15_6:
+; OPT: .LBB15_6:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
; OPT: return{{$}}
define void @test6(i1 %p, i1 %q) {
@@ -638,36 +660,38 @@ second:
; that end in unreachable.
; CHECK-LABEL: test7:
-; CHECK: BB16_1:
-; CHECK-NEXT: loop BB16_5{{$}}
+; CHECK: .LBB16_1:
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: block BB16_4{{$}}
-; CHECK: br_if {{[^,]*}}, BB16_4{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB16_1{{$}}
+; CHECK: br_if {{[^,]*}}, 1{{$}}
; CHECK-NOT: block
; CHECK: unreachable
-; CHECK_NEXT: BB16_4:
+; CHECK-NEXT: .LBB16_4:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB16_1{{$}}
-; CHECK-NEXT: BB16_5:
+; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK-NEXT: end_loop{{$}}
; CHECK-NOT: block
; CHECK: unreachable
; OPT-LABEL: test7:
-; OPT: BB16_1:
-; OPT-NEXT: loop BB16_5{{$}}
+; OPT: .LBB16_1:
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: block BB16_4{{$}}
+; OPT: block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB16_4{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB16_1{{$}}
+; OPT: br_if {{[^,]*}}, 1{{$}}
; OPT-NOT: block
; OPT: unreachable
-; OPT_NEXT: BB16_4:
+; OPT-NEXT: .LBB16_4:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB16_1{{$}}
-; OPT-NEXT: BB16_5:
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT-NEXT: end_loop{{$}}
; OPT-NOT: block
; OPT: unreachable
define void @test7(i1 %tobool2, i1 %tobool9) {
@@ -699,31 +723,33 @@ u1:
; Test an interesting case using nested loops and switches.
; CHECK-LABEL: test8:
-; CHECK: BB17_1:
-; CHECK-NEXT: loop BB17_4{{$}}
-; CHECK-NEXT: block BB17_3{{$}}
+; CHECK: .LBB17_1:
+; CHECK-NEXT: loop{{$}}
+; CHECK-NEXT: block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB17_3{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB17_1{{$}}
-; CHECK-NEXT: BB17_3:
-; CHECK-NEXT: loop BB17_4{{$}}
-; CHECK-NEXT: br_if {{[^,]*}}, BB17_3{{$}}
-; CHECK-NEXT: br BB17_1{{$}}
-; CHECK-NEXT: BB17_4:
+; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK-NEXT: .LBB17_3:
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: loop{{$}}
+; CHECK-NEXT: br_if {{[^,]*}}, 0{{$}}
+; CHECK-NEXT: br 2{{$}}
+; CHECK-NEXT: .LBB17_4:
; OPT-LABEL: test8:
-; OPT: BB17_1:
-; OPT-NEXT: loop BB17_4{{$}}
-; OPT-NEXT: block BB17_3{{$}}
+; OPT: .LBB17_1:
+; OPT-NEXT: loop{{$}}
+; OPT-NEXT: block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB17_3{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB17_1{{$}}
-; OPT-NEXT: BB17_3:
-; OPT-NEXT: loop BB17_4{{$}}
-; OPT-NEXT: br_if {{[^,]*}}, BB17_3{{$}}
-; OPT-NEXT: br BB17_1{{$}}
-; OPT-NEXT: BB17_4:
+; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT-NEXT: .LBB17_3:
+; OPT-NEXT: end_block{{$}}
+; OPT-NEXT: loop{{$}}
+; OPT-NEXT: br_if {{[^,]*}}, 0{{$}}
+; OPT-NEXT: br 2{{$}}
+; OPT-NEXT: .LBB17_4:
define i32 @test8() {
bb:
br label %bb1
@@ -745,45 +771,47 @@ bb3:
; Test an interesting case using nested loops that share a bottom block.
; CHECK-LABEL: test9:
-; CHECK: BB18_1:
-; CHECK-NEXT: loop BB18_5{{$}}
+; CHECK: .LBB18_1:
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB18_5{{$}}
-; CHECK-NEXT: BB18_2:
-; CHECK-NEXT: loop BB18_5{{$}}
+; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK-NEXT: .LBB18_2:
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: block BB18_4{{$}}
+; CHECK: block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB18_4{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB18_2{{$}}
-; CHECK-NEXT: br BB18_1{{$}}
-; CHECK-NEXT: BB18_4:
+; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK-NEXT: br 3{{$}}
+; CHECK-NEXT: .LBB18_4:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB18_2{{$}}
-; CHECK-NEXT: br BB18_1{{$}}
-; CHECK-NEXT: BB18_5:
+; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK-NEXT: br 2{{$}}
+; CHECK-NEXT: .LBB18_5:
; CHECK-NOT: block
; CHECK: return{{$}}
; OPT-LABEL: test9:
-; OPT: BB18_1:
-; OPT-NEXT: loop BB18_5{{$}}
+; OPT: .LBB18_1:
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB18_5{{$}}
-; OPT-NEXT: BB18_2:
-; OPT-NEXT: loop BB18_5{{$}}
+; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT-NEXT: .LBB18_2:
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: block BB18_4{{$}}
+; OPT: block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB18_4{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB18_2{{$}}
-; OPT-NEXT: br BB18_1{{$}}
-; OPT-NEXT: BB18_4:
+; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT-NEXT: br 3{{$}}
+; OPT-NEXT: .LBB18_4:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB18_2{{$}}
-; OPT-NEXT: br BB18_1{{$}}
-; OPT-NEXT: BB18_5:
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT-NEXT: br 2{{$}}
+; OPT-NEXT: .LBB18_5:
; OPT-NOT: block
; OPT: return{{$}}
declare i1 @a()
@@ -821,47 +849,53 @@ end:
; and loop exits to a block with unreachable.
; CHECK-LABEL: test10:
-; CHECK: BB19_1:
-; CHECK-NEXT: loop BB19_7{{$}}
+; CHECK: .LBB19_1:
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB19_1{{$}}
-; CHECK-NEXT: BB19_2:
-; CHECK-NEXT: block BB19_6{{$}}
-; CHECK-NEXT: loop BB19_5{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK-NEXT: .LBB19_2:
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: BB19_3:
-; CHECK-NEXT: loop BB19_5{{$}}
+; CHECK: .LBB19_3:
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB19_1{{$}}
+; CHECK: br_if {{[^,]*}}, 5{{$}}
; CHECK-NOT: block
-; CHECK: tableswitch {{[^,]*}}, BB19_3, BB19_3, BB19_5, BB19_1, BB19_2, BB19_6{{$}}
-; CHECK-NEXT: BB19_5:
+; CHECK: tableswitch {{[^,]*}}, 0, 0, 1, 5, 2, 4{{$}}
+; CHECK-NEXT: .LBB19_5:
+; CHECK-NEXT: end_loop{{$}}
+; CHECK-NEXT: end_loop{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: BB19_6:
+; CHECK-NEXT: .LBB19_6:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br BB19_1{{$}}
-; CHECK-NEXT: BB19_7:
+; CHECK: br 0{{$}}
+; CHECK-NEXT: .LBB19_7:
; OPT-LABEL: test10:
-; OPT: BB19_1:
-; OPT-NEXT: loop BB19_7{{$}}
+; OPT: .LBB19_1:
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB19_1{{$}}
-; OPT-NEXT: BB19_2:
-; OPT-NEXT: block BB19_6{{$}}
-; OPT-NEXT: loop BB19_5{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT-NEXT: .LBB19_2:
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: BB19_3:
-; OPT-NEXT: loop BB19_5{{$}}
+; OPT: .LBB19_3:
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB19_1{{$}}
+; OPT: br_if {{[^,]*}}, 5{{$}}
; OPT-NOT: block
-; OPT: tableswitch {{[^,]*}}, BB19_3, BB19_3, BB19_5, BB19_1, BB19_2, BB19_6{{$}}
-; OPT-NEXT: BB19_5:
+; OPT: tableswitch {{[^,]*}}, 0, 0, 1, 5, 2, 4{{$}}
+; OPT-NEXT: .LBB19_5:
+; OPT-NEXT: end_loop{{$}}
+; OPT-NEXT: end_loop{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: BB19_6:
+; OPT-NEXT: .LBB19_6:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br BB19_1{{$}}
-; OPT-NEXT: BB19_7:
+; OPT: br 0{{$}}
+; OPT-NEXT: .LBB19_7:
define void @test10() {
bb0:
br label %bb1
@@ -900,58 +934,67 @@ bb6:
; Test a CFG DAG with interesting merging.
; CHECK-LABEL: test11:
-; CHECK: block BB20_8{{$}}
-; CHECK-NEXT: block BB20_7{{$}}
-; CHECK-NEXT: block BB20_6{{$}}
-; CHECK-NEXT: block BB20_4{{$}}
-; CHECK-NEXT: br_if {{[^,]*}}, BB20_4{{$}}
+; CHECK: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: br_if {{[^,]*}}, 0{{$}}
+; CHECK-NEXT: block{{$}}
; CHECK-NOT: block
-; CHECK: block BB20_3{{$}}
-; CHECK: br_if {{[^,]*}}, BB20_3{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB20_6{{$}}
-; CHECK-NEXT: BB20_3:
+; CHECK: br_if {{[^,]*}}, 2{{$}}
+; CHECK-NEXT: .LBB20_3:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
; CHECK: return{{$}}
-; CHECK-NEXT: BB20_4:
+; CHECK-NEXT: .LBB20_4:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB20_8{{$}}
+; CHECK: br_if {{[^,]*}}, 2{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB20_7{{$}}
-; CHECK-NEXT: BB20_6:
+; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK-NEXT: .LBB20_6:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
; CHECK: return{{$}}
-; CHECK-NEXT: BB20_7:
+; CHECK-NEXT: .LBB20_7:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
; CHECK: return{{$}}
-; CHECK-NEXT: BB20_8:
+; CHECK-NEXT: .LBB20_8:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
; CHECK: return{{$}}
; OPT-LABEL: test11:
-; OPT: block BB20_8{{$}}
-; OPT-NEXT: block BB20_4{{$}}
-; OPT-NEXT: br_if $0, BB20_4{{$}}
+; OPT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: br_if $0, 0{{$}}
+; OPT-NEXT: block{{$}}
; OPT-NOT: block
-; OPT: block BB20_3{{$}}
-; OPT: br_if $0, BB20_3{{$}}
+; OPT: br_if $0, 0{{$}}
; OPT-NOT: block
-; OPT: br_if $0, BB20_8{{$}}
-; OPT-NEXT: BB20_3:
+; OPT: br_if $0, 2{{$}}
+; OPT-NEXT: .LBB20_3:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
; OPT: return{{$}}
-; OPT-NEXT: BB20_4:
+; OPT-NEXT: .LBB20_4:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: block BB20_6{{$}}
+; OPT: block{{$}}
; OPT-NOT: block
-; OPT: br_if $pop9, BB20_6{{$}}
+; OPT: br_if $pop9, 0{{$}}
; OPT-NOT: block
; OPT: return{{$}}
-; OPT-NEXT: BB20_6:
+; OPT-NEXT: .LBB20_6:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br_if $0, BB20_8{{$}}
+; OPT: br_if $0, 0{{$}}
; OPT-NOT: block
; OPT: return{{$}}
-; OPT-NEXT: BB20_8:
+; OPT-NEXT: .LBB20_8:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
; OPT: return{{$}}
define void @test11() {
@@ -985,53 +1028,59 @@ bb8:
}
; CHECK-LABEL: test12:
-; CHECK: BB21_1:
-; CHECK-NEXT: loop BB21_8{{$}}
+; CHECK: .LBB21_1:
+; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: block BB21_7{{$}}
-; CHECK-NEXT: block BB21_6{{$}}
-; CHECK-NEXT: block BB21_4{{$}}
-; CHECK: br_if {{[^,]*}}, BB21_4{{$}}
+; CHECK: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK: br_if {{[^,]*}}, 0{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB21_7{{$}}
+; CHECK: br_if {{[^,]*}}, 2{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB21_7{{$}}
-; CHECK-NEXT: br BB21_6{{$}}
-; CHECK-NEXT: BB21_4:
+; CHECK: br_if {{[^,]*}}, 2{{$}}
+; CHECK-NEXT: br 1{{$}}
+; CHECK-NEXT: .LBB21_4:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB21_7{{$}}
+; CHECK: br_if {{[^,]*}}, 1{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, BB21_7{{$}}
-; CHECK-NEXT: BB21_6:
+; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK-NEXT: .LBB21_6:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: BB21_7:
+; CHECK-NEXT: .LBB21_7:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br BB21_1{{$}}
-; CHECK-NEXT: BB21_8:
+; CHECK: br 0{{$}}
+; CHECK-NEXT: .LBB21_8:
; OPT-LABEL: test12:
-; OPT: BB21_1:
-; OPT-NEXT: loop BB21_8{{$}}
+; OPT: .LBB21_1:
+; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: block BB21_7{{$}}
-; OPT-NEXT: block BB21_6{{$}}
-; OPT-NEXT: block BB21_4{{$}}
-; OPT: br_if {{[^,]*}}, BB21_4{{$}}
+; OPT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT: br_if {{[^,]*}}, 0{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB21_7{{$}}
+; OPT: br_if {{[^,]*}}, 2{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB21_7{{$}}
-; OPT-NEXT: br BB21_6{{$}}
-; OPT-NEXT: BB21_4:
+; OPT: br_if {{[^,]*}}, 2{{$}}
+; OPT-NEXT: br 1{{$}}
+; OPT-NEXT: .LBB21_4:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB21_7{{$}}
+; OPT: br_if {{[^,]*}}, 1{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, BB21_7{{$}}
-; OPT-NEXT: BB21_6:
+; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT-NEXT: .LBB21_6:
+; OPT-NEXT: end_block{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: BB21_7:
+; OPT-NEXT: .LBB21_7:
+; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br BB21_1{{$}}
-; OPT-NEXT: BB21_8:
+; OPT: br 0{{$}}
+; OPT-NEXT: .LBB21_8:
define void @test12(i8* %arg) {
bb:
br label %bb1
@@ -1060,30 +1109,34 @@ bb7:
; optnone to disable optimizations to test this case.
; CHECK-LABEL: test13:
-; CHECK-NEXT: .local i32{{$}}
-; CHECK: block BB22_2{{$}}
-; CHECK: br_if $pop4, BB22_2{{$}}
+; CHECK-NEXT: local i32{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if $pop4, 0{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: BB22_2:
-; CHECK: block BB22_4{{$}}
-; CHECK-NEXT: br_if $0, BB22_4{{$}}
-; CHECK: BB22_4:
-; CHECK: block BB22_5{{$}}
-; CHECK: br_if $pop6, BB22_5{{$}}
-; CHECK-NEXT: BB22_5:
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: end_block{{$}}
+; CHECK: block{{$}}
+; CHECK-NEXT: br_if $0, 0{{$}}
+; CHECK: .LBB22_4:
+; CHECK-NEXT: end_block{{$}}
+; CHECK: block{{$}}
+; CHECK: br_if $pop6, 0{{$}}
+; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: unreachable{{$}}
; OPT-LABEL: test13:
-; OPT-NEXT: .local i32{{$}}
-; OPT: block BB22_2{{$}}
-; OPT: br_if $pop4, BB22_2{{$}}
+; OPT-NEXT: local i32{{$}}
+; OPT: block{{$}}
+; OPT: br_if $pop4, 0{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: BB22_2:
-; OPT: block BB22_4{{$}}
-; OPT-NEXT: br_if $0, BB22_4{{$}}
-; OPT: BB22_4:
-; OPT: block BB22_5{{$}}
-; OPT: br_if $pop6, BB22_5{{$}}
-; OPT-NEXT: BB22_5:
+; OPT-NEXT: .LBB22_2:
+; OPT-NEXT: end_block{{$}}
+; OPT: block{{$}}
+; OPT-NEXT: br_if $0, 0{{$}}
+; OPT: .LBB22_4:
+; OPT-NEXT: end_block{{$}}
+; OPT: block{{$}}
+; OPT: br_if $pop6, 0{{$}}
+; OPT-NEXT: end_block{{$}}
; OPT-NEXT: unreachable{{$}}
define void @test13() noinline optnone {
bb:
@@ -1100,3 +1153,65 @@ bb4:
bb5:
ret void
}
+
+; Test a case with a single-block loop that has another loop
+; as a successor. The end_loop for the first loop should go
+; before the loop for the second.
+
+; CHECK-LABEL: test14:
+; CHECK-NEXT: local i32{{$}}
+; CHECK-NEXT: i32.const $0=, 0{{$}}
+; CHECK-NEXT: .LBB23_1:{{$}}
+; CHECK-NEXT: loop{{$}}
+; CHECK-NEXT: br_if $0, 0{{$}}
+; CHECK-NEXT: .LBB23_2:{{$}}
+; CHECK-NEXT: end_loop{{$}}
+; CHECK-NEXT: loop{{$}}
+; CHECK-NEXT: br_if $0, 0{{$}}
+; CHECK-NEXT: end_loop{{$}}
+; CHECK-NEXT: return{{$}}
+define void @test14() {
+bb:
+ br label %bb1
+
+bb1:
+ %tmp = bitcast i1 undef to i1
+ br i1 %tmp, label %bb3, label %bb1
+
+bb3:
+ br label %bb4
+
+bb4:
+ br i1 undef, label %bb7, label %bb48
+
+bb7:
+ br i1 undef, label %bb12, label %bb12
+
+bb12:
+ br i1 undef, label %bb17, label %bb17
+
+bb17:
+ br i1 undef, label %bb22, label %bb22
+
+bb22:
+ br i1 undef, label %bb27, label %bb27
+
+bb27:
+ br i1 undef, label %bb30, label %bb30
+
+bb30:
+ br i1 undef, label %bb35, label %bb35
+
+bb35:
+ br i1 undef, label %bb38, label %bb38
+
+bb38:
+ br i1 undef, label %bb48, label %bb48
+
+bb48:
+ %tmp49 = bitcast i1 undef to i1
+ br i1 %tmp49, label %bb3, label %bb50
+
+bb50:
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/comparisons_f32.ll b/test/CodeGen/WebAssembly/comparisons_f32.ll
index 6df37ea1c6dd5..2d324f7f20830 100644
--- a/test/CodeGen/WebAssembly/comparisons_f32.ll
+++ b/test/CodeGen/WebAssembly/comparisons_f32.ll
@@ -3,7 +3,7 @@
; Test that basic 32-bit floating-point comparison operations assemble as
; expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: ord_f32:
diff --git a/test/CodeGen/WebAssembly/comparisons_f64.ll b/test/CodeGen/WebAssembly/comparisons_f64.ll
index f5acc64b667c3..22fbc1ae4c1fd 100644
--- a/test/CodeGen/WebAssembly/comparisons_f64.ll
+++ b/test/CodeGen/WebAssembly/comparisons_f64.ll
@@ -3,7 +3,7 @@
; Test that basic 64-bit floating-point comparison operations assemble as
; expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: ord_f64:
diff --git a/test/CodeGen/WebAssembly/comparisons_i32.ll b/test/CodeGen/WebAssembly/comparisons_i32.ll
index b724cec1cc632..db81ef36e2701 100644
--- a/test/CodeGen/WebAssembly/comparisons_i32.ll
+++ b/test/CodeGen/WebAssembly/comparisons_i32.ll
@@ -2,7 +2,7 @@
; Test that basic 32-bit integer comparison operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: eq_i32:
diff --git a/test/CodeGen/WebAssembly/comparisons_i64.ll b/test/CodeGen/WebAssembly/comparisons_i64.ll
index 898591999bec2..19e5cf8603bfe 100644
--- a/test/CodeGen/WebAssembly/comparisons_i64.ll
+++ b/test/CodeGen/WebAssembly/comparisons_i64.ll
@@ -2,7 +2,7 @@
; Test that basic 64-bit integer comparison operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: eq_i64:
diff --git a/test/CodeGen/WebAssembly/conv.ll b/test/CodeGen/WebAssembly/conv.ll
index e1acaca2c9ecf..1a4bd72d72d6e 100644
--- a/test/CodeGen/WebAssembly/conv.ll
+++ b/test/CodeGen/WebAssembly/conv.ll
@@ -2,7 +2,7 @@
; Test that basic conversion operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: i32_wrap_i64:
diff --git a/test/CodeGen/WebAssembly/copysign-casts.ll b/test/CodeGen/WebAssembly/copysign-casts.ll
index 760e491330183..f8e50d043ca91 100644
--- a/test/CodeGen/WebAssembly/copysign-casts.ll
+++ b/test/CodeGen/WebAssembly/copysign-casts.ll
@@ -3,7 +3,7 @@
; DAGCombiner oddly folds casts into the rhs of copysign. Test that they get
; unfolded.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare double @copysign(double, double) nounwind readnone
diff --git a/test/CodeGen/WebAssembly/cpus.ll b/test/CodeGen/WebAssembly/cpus.ll
index 2b77c5f475c83..51856fcd12c2d 100644
--- a/test/CodeGen/WebAssembly/cpus.ll
+++ b/test/CodeGen/WebAssembly/cpus.ll
@@ -1,13 +1,13 @@
; This tests that llc accepts all valid WebAssembly CPUs.
-; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=wasm64-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=wasm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=wasm64-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
-; RUN: llc < %s -mtriple=wasm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; CHECK-NOT: {{.*}} is not a recognized processor for this target
; INVALID: {{.*}} is not a recognized processor for this target
diff --git a/test/CodeGen/WebAssembly/dead-vreg.ll b/test/CodeGen/WebAssembly/dead-vreg.ll
index b03e1569fde6b..29a41990961d1 100644
--- a/test/CodeGen/WebAssembly/dead-vreg.ll
+++ b/test/CodeGen/WebAssembly/dead-vreg.ll
@@ -2,7 +2,7 @@
; Check that unused vregs aren't assigned registers.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
define void @foo(i32* nocapture %a, i32 %w, i32 %h) {
diff --git a/test/CodeGen/WebAssembly/f32.ll b/test/CodeGen/WebAssembly/f32.ll
index 777010064cdba..c32a7c3dc7d9d 100644
--- a/test/CodeGen/WebAssembly/f32.ll
+++ b/test/CodeGen/WebAssembly/f32.ll
@@ -2,7 +2,7 @@
; Test that basic 32-bit floating-point operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare float @llvm.fabs.f32(float)
@@ -146,7 +146,7 @@ define float @fmax32(float %x) {
}
; CHECK-LABEL: fma32:
-; CHECK: {{^}} f32.call $push0=, fmaf, $0, $1, $2{{$}}
+; CHECK: {{^}} f32.call $push0=, fmaf@FUNCTION, $0, $1, $2{{$}}
; CHECK-NEXT: return $pop0{{$}}
define float @fma32(float %a, float %b, float %c) {
%d = call float @llvm.fma.f32(float %a, float %b, float %c)
diff --git a/test/CodeGen/WebAssembly/f64.ll b/test/CodeGen/WebAssembly/f64.ll
index 302ee79389b38..92284999cbf7c 100644
--- a/test/CodeGen/WebAssembly/f64.ll
+++ b/test/CodeGen/WebAssembly/f64.ll
@@ -2,7 +2,7 @@
; Test that basic 64-bit floating-point operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare double @llvm.fabs.f64(double)
@@ -146,7 +146,7 @@ define double @fmax64(double %x) {
}
; CHECK-LABEL: fma64:
-; CHECK: {{^}} f64.call $push0=, fma, $0, $1, $2{{$}}
+; CHECK: {{^}} f64.call $push0=, fma@FUNCTION, $0, $1, $2{{$}}
; CHECK-NEXT: return $pop0{{$}}
define double @fma64(double %a, double %b, double %c) {
%d = call double @llvm.fma.f64(double %a, double %b, double %c)
diff --git a/test/CodeGen/WebAssembly/fast-isel.ll b/test/CodeGen/WebAssembly/fast-isel.ll
index 07d78c1415e5f..7f9f20fa70834 100644
--- a/test/CodeGen/WebAssembly/fast-isel.ll
+++ b/test/CodeGen/WebAssembly/fast-isel.ll
@@ -2,7 +2,7 @@
; RUN: -fast-isel -fast-isel-abort=1 -verify-machineinstrs \
; RUN: | FileCheck %s
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; This tests very minimal fast-isel functionality.
diff --git a/test/CodeGen/WebAssembly/frem.ll b/test/CodeGen/WebAssembly/frem.ll
index 688370313b486..b8c80fbe69978 100644
--- a/test/CodeGen/WebAssembly/frem.ll
+++ b/test/CodeGen/WebAssembly/frem.ll
@@ -2,13 +2,13 @@
; Test that the frem instruction works.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: frem32:
; CHECK-NEXT: .param f32, f32{{$}}
; CHECK-NEXT: .result f32{{$}}
-; CHECK-NEXT: {{^}} f32.call $push0=, fmodf, $0, $1{{$}}
+; CHECK-NEXT: {{^}} f32.call $push0=, fmodf@FUNCTION, $0, $1{{$}}
; CHECK-NEXT: return $pop0{{$}}
define float @frem32(float %x, float %y) {
%a = frem float %x, %y
@@ -18,7 +18,7 @@ define float @frem32(float %x, float %y) {
; CHECK-LABEL: frem64:
; CHECK-NEXT: .param f64, f64{{$}}
; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: {{^}} f64.call $push0=, fmod, $0, $1{{$}}
+; CHECK-NEXT: {{^}} f64.call $push0=, fmod@FUNCTION, $0, $1{{$}}
; CHECK-NEXT: return $pop0{{$}}
define double @frem64(double %x, double %y) {
%a = frem double %x, %y
diff --git a/test/CodeGen/WebAssembly/func.ll b/test/CodeGen/WebAssembly/func.ll
index 6f42dc744ac79..9857dadee414d 100644
--- a/test/CodeGen/WebAssembly/func.ll
+++ b/test/CodeGen/WebAssembly/func.ll
@@ -2,11 +2,12 @@
; Test that basic functions assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: f0:
; CHECK: return{{$}}
+; CHECK: .endfunc{{$}}
; CHECK: .size f0,
define void @f0() {
ret void
diff --git a/test/CodeGen/WebAssembly/global.ll b/test/CodeGen/WebAssembly/global.ll
index 5f149ed067c82..85fe5c8965655 100644
--- a/test/CodeGen/WebAssembly/global.ll
+++ b/test/CodeGen/WebAssembly/global.ll
@@ -2,7 +2,7 @@
; Test that globals assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-NOT: llvm.used
@@ -21,7 +21,7 @@ define i32 @foo() {
; CHECK-LABEL: call_memcpy:
; CHECK-NEXT: .param i32, i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: call memcpy, $0, $1, $2{{$}}
+; CHECK-NEXT: call memcpy@FUNCTION, $0, $1, $2{{$}}
; CHECK-NEXT: return $0{{$}}
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
@@ -29,15 +29,15 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
ret i8* %p
}
-; CHECK: .type g,@object
+; CHECK: .type .Lg,@object
; CHECK: .align 2{{$}}
-; CHECK-NEXT: g:
+; CHECK-NEXT: .Lg:
; CHECK-NEXT: .int32 1337{{$}}
-; CHECK-NEXT: .size g, 4{{$}}
+; CHECK-NEXT: .size .Lg, 4{{$}}
@g = private global i32 1337
; CHECK-LABEL: ud:
-; CHECK-NEXT: .zero 4{{$}}
+; CHECK-NEXT: .skip 4{{$}}
; CHECK-NEXT: .size ud, 4{{$}}
@ud = internal global i32 undef
@@ -73,7 +73,7 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
; CHECK: .type ud64,@object
; CHECK: .align 3{{$}}
; CHECK-NEXT: ud64:
-; CHECK-NEXT: .zero 8{{$}}
+; CHECK-NEXT: .skip 8{{$}}
; CHECK-NEXT: .size ud64, 8{{$}}
@ud64 = internal global i64 undef
@@ -102,7 +102,7 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
; CHECK: .type f32ud,@object
; CHECK: .align 2{{$}}
; CHECK-NEXT: f32ud:
-; CHECK-NEXT: .zero 4{{$}}
+; CHECK-NEXT: .skip 4{{$}}
; CHECK-NEXT: .size f32ud, 4{{$}}
@f32ud = internal global float undef
@@ -131,7 +131,7 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
; CHECK: .type f64ud,@object
; CHECK: .align 3{{$}}
; CHECK-NEXT: f64ud:
-; CHECK-NEXT: .zero 8{{$}}
+; CHECK-NEXT: .skip 8{{$}}
; CHECK-NEXT: .size f64ud, 8{{$}}
@f64ud = internal global double undef
@@ -172,6 +172,6 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
; CHECK: .globl rom{{$}}
; CHECK: .align 4{{$}}
; CHECK: rom:
-; CHECK: .zero 512{{$}}
+; CHECK: .skip 512{{$}}
; CHECK: .size rom, 512{{$}}
@rom = constant [128 x i32] zeroinitializer, align 16
diff --git a/test/CodeGen/WebAssembly/globl.ll b/test/CodeGen/WebAssembly/globl.ll
index a5dc028c1db40..91d3ade4666b4 100644
--- a/test/CodeGen/WebAssembly/globl.ll
+++ b/test/CodeGen/WebAssembly/globl.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -asm-verbose=false | FileCheck %s
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK: .globl foo
diff --git a/test/CodeGen/WebAssembly/i32.ll b/test/CodeGen/WebAssembly/i32.ll
index ab29b0472bf2d..10d97ad9e6d1e 100644
--- a/test/CodeGen/WebAssembly/i32.ll
+++ b/test/CodeGen/WebAssembly/i32.ll
@@ -2,7 +2,7 @@
; Test that basic 32-bit integer operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare i32 @llvm.ctlz.i32(i32, i1)
diff --git a/test/CodeGen/WebAssembly/i64.ll b/test/CodeGen/WebAssembly/i64.ll
index 769f74266754b..6dd46a91fad0c 100644
--- a/test/CodeGen/WebAssembly/i64.ll
+++ b/test/CodeGen/WebAssembly/i64.ll
@@ -2,7 +2,7 @@
; Test that basic 64-bit integer operations assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare i64 @llvm.ctlz.i64(i64, i1)
diff --git a/test/CodeGen/WebAssembly/ident.ll b/test/CodeGen/WebAssembly/ident.ll
index 1e0dc2aa67254..49c188ec2578a 100644
--- a/test/CodeGen/WebAssembly/ident.ll
+++ b/test/CodeGen/WebAssembly/ident.ll
@@ -2,7 +2,7 @@
; Test llvm.ident.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK: .ident "hello world"
diff --git a/test/CodeGen/WebAssembly/immediates.ll b/test/CodeGen/WebAssembly/immediates.ll
index abab11f2254ea..735b386b4fc0f 100644
--- a/test/CodeGen/WebAssembly/immediates.ll
+++ b/test/CodeGen/WebAssembly/immediates.ll
@@ -2,7 +2,7 @@
; Test that basic immediates assemble as expected.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: zero_i32:
diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll
index fc066c4b812f8..f35042e64f86d 100644
--- a/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/test/CodeGen/WebAssembly/inline-asm.ll
@@ -1,8 +1,9 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -no-integrated-as | FileCheck %s
-; Test basic inline assembly.
+; Test basic inline assembly. Pass -no-integrated-as since these aren't
+; actually valid assembly syntax.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/WebAssembly/legalize.ll b/test/CodeGen/WebAssembly/legalize.ll
index e780b2ee36ca3..5feb2e8c8c75e 100644
--- a/test/CodeGen/WebAssembly/legalize.ll
+++ b/test/CodeGen/WebAssembly/legalize.ll
@@ -2,7 +2,7 @@
; Test various types and operators that need to be legalized.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: shl_i3:
diff --git a/test/CodeGen/WebAssembly/load-ext.ll b/test/CodeGen/WebAssembly/load-ext.ll
index 0ffcd38a86660..d52df3361a380 100644
--- a/test/CodeGen/WebAssembly/load-ext.ll
+++ b/test/CodeGen/WebAssembly/load-ext.ll
@@ -2,7 +2,7 @@
; Test that extending loads are assembled properly.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: sext_i8_i32:
diff --git a/test/CodeGen/WebAssembly/load-store-i1.ll b/test/CodeGen/WebAssembly/load-store-i1.ll
index 37b5147294793..47e2e8cb254f8 100644
--- a/test/CodeGen/WebAssembly/load-store-i1.ll
+++ b/test/CodeGen/WebAssembly/load-store-i1.ll
@@ -2,7 +2,7 @@
; Test that i1 extending loads and truncating stores are assembled properly.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: load_u_i1_i32:
diff --git a/test/CodeGen/WebAssembly/load.ll b/test/CodeGen/WebAssembly/load.ll
index aa8ae689e0d11..243fa9d50ad64 100644
--- a/test/CodeGen/WebAssembly/load.ll
+++ b/test/CodeGen/WebAssembly/load.ll
@@ -2,7 +2,7 @@
; Test that basic loads are assembled properly.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: ldi32:
diff --git a/test/CodeGen/WebAssembly/loop-idiom.ll b/test/CodeGen/WebAssembly/loop-idiom.ll
index 2906df20a2290..2a233c4069001 100644
--- a/test/CodeGen/WebAssembly/loop-idiom.ll
+++ b/test/CodeGen/WebAssembly/loop-idiom.ll
@@ -1,6 +1,6 @@
; RUN: opt -loop-idiom -S < %s -march=wasm32 | FileCheck %s
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
diff --git a/test/CodeGen/WebAssembly/memory-addr32.ll b/test/CodeGen/WebAssembly/memory-addr32.ll
index e2dd556bddc05..e6c15633fd630 100644
--- a/test/CodeGen/WebAssembly/memory-addr32.ll
+++ b/test/CodeGen/WebAssembly/memory-addr32.ll
@@ -2,7 +2,7 @@
; Test that basic memory operations assemble as expected with 32-bit addresses.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare i32 @llvm.wasm.memory.size.i32() nounwind readonly
diff --git a/test/CodeGen/WebAssembly/memory-addr64.ll b/test/CodeGen/WebAssembly/memory-addr64.ll
index 5de1f2b11cfda..d504c277f306d 100644
--- a/test/CodeGen/WebAssembly/memory-addr64.ll
+++ b/test/CodeGen/WebAssembly/memory-addr64.ll
@@ -2,7 +2,7 @@
; Test that basic memory operations assemble as expected with 64-bit addresses.
-target datalayout = "e-p:64:64-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
target triple = "wasm64-unknown-unknown"
declare i64 @llvm.wasm.memory.size.i64() nounwind readonly
diff --git a/test/CodeGen/WebAssembly/offset-folding.ll b/test/CodeGen/WebAssembly/offset-folding.ll
index 2b4e8a90b0f0d..159a25eba3580 100644
--- a/test/CodeGen/WebAssembly/offset-folding.ll
+++ b/test/CodeGen/WebAssembly/offset-folding.ll
@@ -2,7 +2,7 @@
; Test that constant offsets can be folded into global addresses.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; FIXME: make this 'external' and make sure it still works. WebAssembly
diff --git a/test/CodeGen/WebAssembly/offset.ll b/test/CodeGen/WebAssembly/offset.ll
index 901801d7dbbe2..828f40206a962 100644
--- a/test/CodeGen/WebAssembly/offset.ll
+++ b/test/CodeGen/WebAssembly/offset.ll
@@ -2,7 +2,7 @@
; Test constant load and store address offsets.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; With an nuw add, we can fold an offset.
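A sketch of the nuw case the comment describes, with a hypothetical function name (the real test body may differ in details):

define i32 @load_i32_with_folded_offset(i32* %p) {
  ; add nuw cannot wrap, so the +24 is known to address past %p rather than
  ; wrapping around; the backend can fold it into the load's offset immediate.
  %q = ptrtoint i32* %p to i32
  %r = add nuw i32 %q, 24
  %s = inttoptr i32 %r to i32*
  %t = load i32, i32* %s
  ret i32 %t
}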
diff --git a/test/CodeGen/WebAssembly/phi.ll b/test/CodeGen/WebAssembly/phi.ll
index bae8a7c9e3b85..00e5859b75cf8 100644
--- a/test/CodeGen/WebAssembly/phi.ll
+++ b/test/CodeGen/WebAssembly/phi.ll
@@ -2,7 +2,7 @@
; Test that phis are lowered.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; Basic phi triangle.
@@ -25,7 +25,7 @@ done:
; Swap phis.
; CHECK-LABEL: test1:
-; CHECK: BB1_1:
+; CHECK: .LBB1_1:
; CHECK: copy_local $[[NUM0:[0-9]+]]=, $[[NUM1:[0-9]+]]{{$}}
; CHECK: copy_local $[[NUM1]]=, $[[NUM2:[0-9]+]]{{$}}
; CHECK: copy_local $[[NUM2]]=, $[[NUM0]]{{$}}
diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll
index 1c1b1e193f7af..f8cae7f924046 100644
--- a/test/CodeGen/WebAssembly/reg-stackify.ll
+++ b/test/CodeGen/WebAssembly/reg-stackify.ll
@@ -2,7 +2,7 @@
; Test the register stackifier pass.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; No because of pointer aliasing.
@@ -55,7 +55,7 @@ define i32 @yes1(i32* %q) {
; CHECK-NEXT: .local i32, i32{{$}}
; CHECK-NEXT: i32.const $5=, 2{{$}}
; CHECK-NEXT: i32.const $4=, 1{{$}}
-; CHECK-NEXT: block BB4_2{{$}}
+; CHECK-NEXT: block{{$}}
; CHECK-NEXT: i32.lt_s $push0=, $0, $4{{$}}
; CHECK-NEXT: i32.lt_s $push1=, $1, $5{{$}}
; CHECK-NEXT: i32.xor $push4=, $pop0, $pop1{{$}}
@@ -64,10 +64,11 @@ define i32 @yes1(i32* %q) {
; CHECK-NEXT: i32.xor $push5=, $pop2, $pop3{{$}}
; CHECK-NEXT: i32.xor $push6=, $pop4, $pop5{{$}}
; CHECK-NEXT: i32.ne $push7=, $pop6, $4{{$}}
-; CHECK-NEXT: br_if $pop7, BB4_2{{$}}
+; CHECK-NEXT: br_if $pop7, 0{{$}}
; CHECK-NEXT: i32.const $push8=, 0{{$}}
; CHECK-NEXT: return $pop8{{$}}
-; CHECK-NEXT: BB4_2:
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return $4{{$}}
define i32 @stack_uses(i32 %x, i32 %y, i32 %z, i32 %w) {
entry:
@@ -89,16 +90,17 @@ false:
; be trivially stackified.
; CHECK-LABEL: multiple_uses:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
-; CHECK-NEXT: .local i32{{$}}
+; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK-NEXT: .local i32{{$}}
; CHECK-NEXT: i32.load $3=, 0($2){{$}}
-; CHECK-NEXT: block BB5_3{{$}}
+; CHECK-NEXT: block{{$}}
; CHECK-NEXT: i32.ge_u $push0=, $3, $1{{$}}
-; CHECK-NEXT: br_if $pop0, BB5_3{{$}}
+; CHECK-NEXT: br_if $pop0, 0{{$}}
; CHECK-NEXT: i32.lt_u $push1=, $3, $0{{$}}
-; CHECK-NEXT: br_if $pop1, BB5_3{{$}}
+; CHECK-NEXT: br_if $pop1, 0{{$}}
; CHECK-NEXT: i32.store $discard=, 0($2), $3{{$}}
-; CHECK-NEXT: BB5_3:
+; CHECK-NEXT: .LBB5_3:
+; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
define void @multiple_uses(i32* %arg0, i32* %arg1, i32* %arg2) nounwind {
bb:
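The label-to-number change in the checks above is the heart of this patch: br_if now takes a relative block depth, counted outward from the innermost enclosing block, and the .LBB labels survive only as assembler-local markers. Illustrative only, not the output of any one test here:

  block                # depth 1 as seen from the branches below
    block              # depth 0 as seen from the branches below
      br_if $pop0, 0   # exits the inner block, to its end_block
      br_if $pop1, 1   # exits both blocks
    end_block          # target of depth 0
  end_block            # target of depth 1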
diff --git a/test/CodeGen/WebAssembly/return-int32.ll b/test/CodeGen/WebAssembly/return-int32.ll
index 663cef4e459d2..a93a0f6c438bd 100644
--- a/test/CodeGen/WebAssembly/return-int32.ll
+++ b/test/CodeGen/WebAssembly/return-int32.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -asm-verbose=false | FileCheck %s
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: return_i32:
diff --git a/test/CodeGen/WebAssembly/return-void.ll b/test/CodeGen/WebAssembly/return-void.ll
index 4933bfcb87e66..65ff5f3257194 100644
--- a/test/CodeGen/WebAssembly/return-void.ll
+++ b/test/CodeGen/WebAssembly/return-void.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -asm-verbose=false | FileCheck %s
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: return_void:
diff --git a/test/CodeGen/WebAssembly/returned.ll b/test/CodeGen/WebAssembly/returned.ll
index e208e198c73dc..9c892bb3eceae 100644
--- a/test/CodeGen/WebAssembly/returned.ll
+++ b/test/CodeGen/WebAssembly/returned.ll
@@ -2,14 +2,14 @@
; Test that the "returned" attribute is optimized effectively.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: _Z3foov:
; CHECK-NEXT: .result i32{{$}}
; CHECK-NEXT: i32.const $push0=, 1{{$}}
-; CHECK-NEXT: {{^}} i32.call $push1=, _Znwm, $pop0{{$}}
-; CHECK-NEXT: {{^}} i32.call $push2=, _ZN5AppleC1Ev, $pop1{{$}}
+; CHECK-NEXT: {{^}} i32.call $push1=, _Znwm@FUNCTION, $pop0{{$}}
+; CHECK-NEXT: {{^}} i32.call $push2=, _ZN5AppleC1Ev@FUNCTION, $pop1{{$}}
; CHECK-NEXT: return $pop2{{$}}
%class.Apple = type { i8 }
declare noalias i8* @_Znwm(i32)
@@ -25,7 +25,7 @@ entry:
; CHECK-LABEL: _Z3barPvS_l:
; CHECK-NEXT: .param i32, i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: {{^}} i32.call $push0=, memcpy, $0, $1, $2{{$}}
+; CHECK-NEXT: {{^}} i32.call $push0=, memcpy@FUNCTION, $0, $1, $2{{$}}
; CHECK-NEXT: return $pop0{{$}}
declare i8* @memcpy(i8* returned, i8*, i32)
define i8* @_Z3barPvS_l(i8* %p, i8* %s, i32 %n) {
@@ -38,7 +38,7 @@ entry:
; CHECK-LABEL: test_constant_arg:
; CHECK-NEXT: i32.const $push0=, global{{$}}
-; CHECK-NEXT: {{^}} i32.call $discard=, returns_arg, $pop0{{$}}
+; CHECK-NEXT: {{^}} i32.call $discard=, returns_arg@FUNCTION, $pop0{{$}}
; CHECK-NEXT: return{{$}}
@global = external global i32
@addr = global i32* @global
diff --git a/test/CodeGen/WebAssembly/select.ll b/test/CodeGen/WebAssembly/select.ll
index 1b1d7aed7154f..416f58cac0d3a 100644
--- a/test/CodeGen/WebAssembly/select.ll
+++ b/test/CodeGen/WebAssembly/select.ll
@@ -3,7 +3,7 @@
; Test that the wasm select instruction is selected from the LLVM select instruction.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: select_i32_bool:
diff --git a/test/CodeGen/WebAssembly/signext-zeroext.ll b/test/CodeGen/WebAssembly/signext-zeroext.ll
index 40d49af0ccc7a..f6f56363c1af1 100644
--- a/test/CodeGen/WebAssembly/signext-zeroext.ll
+++ b/test/CodeGen/WebAssembly/signext-zeroext.ll
@@ -2,7 +2,7 @@
; Test zeroext and signext ABI keywords
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: z2s_func:
@@ -32,7 +32,7 @@ define zeroext i8 @s2z_func(i8 signext %t) {
; CHECK-NEXT: .result i32{{$}}
; CHECK-NEXT: i32.const $push[[NUM0:[0-9]+]]=, 255{{$}}
; CHECK-NEXT: i32.and $push[[NUM1:[0-9]+]]=, $0, $pop[[NUM0]]{{$}}
-; CHECK-NEXT: call $push[[NUM2:[0-9]+]]=, z2s_func, $pop[[NUM1]]{{$}}
+; CHECK-NEXT: call $push[[NUM2:[0-9]+]]=, z2s_func@FUNCTION, $pop[[NUM1]]{{$}}
; CHECK-NEXT: return $pop[[NUM2]]{{$}}
define i32 @z2s_call(i32 %t) {
%s = trunc i32 %t to i8
@@ -48,7 +48,7 @@ define i32 @z2s_call(i32 %t) {
; CHECK-NEXT: i32.const $[[NUM0:[0-9]+]]=, 24{{$}}
; CHECK-NEXT: i32.shl $push[[NUM1:[0-9]+]]=, $0, $[[NUM0]]{{$}}
; CHECK-NEXT: i32.shr_s $push[[NUM2:[0-9]+]]=, $pop[[NUM1]], $[[NUM0]]{{$}}
-; CHECK-NEXT: call $push[[NUM3:[0-9]]]=, s2z_func, $pop[[NUM2]]{{$}}
+; CHECK-NEXT: call $push[[NUM3:[0-9]]]=, s2z_func@FUNCTION, $pop[[NUM2]]{{$}}
; CHECK-NEXT: i32.shl $push[[NUM4:[0-9]+]]=, $pop[[NUM3]], $[[NUM0]]{{$}}
; CHECK-NEXT: i32.shr_s $push[[NUM5:[0-9]+]]=, $pop[[NUM4]], $[[NUM0]]{{$}}
; CHECK-NEXT: return $pop[[NUM5]]{{$}}
diff --git a/test/CodeGen/WebAssembly/store-results.ll b/test/CodeGen/WebAssembly/store-results.ll
index 73479e544db94..ae74133fe3869 100644
--- a/test/CodeGen/WebAssembly/store-results.ll
+++ b/test/CodeGen/WebAssembly/store-results.ll
@@ -3,7 +3,7 @@
; Test that the wasm-store-results pass makes users of stored values use the
; result of store expressions to reduce get_local/set_local traffic.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: single_block:
diff --git a/test/CodeGen/WebAssembly/store-trunc.ll b/test/CodeGen/WebAssembly/store-trunc.ll
index c12b716dfd59c..d069af1da7bcb 100644
--- a/test/CodeGen/WebAssembly/store-trunc.ll
+++ b/test/CodeGen/WebAssembly/store-trunc.ll
@@ -2,7 +2,7 @@
; Test that truncating stores are assembled properly.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: trunc_i8_i32:
diff --git a/test/CodeGen/WebAssembly/store.ll b/test/CodeGen/WebAssembly/store.ll
index 442caedef3a7d..dc93ebbbadb40 100644
--- a/test/CodeGen/WebAssembly/store.ll
+++ b/test/CodeGen/WebAssembly/store.ll
@@ -2,7 +2,7 @@
; Test that basic stores are assembled properly.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: sti32:
diff --git a/test/CodeGen/WebAssembly/switch.ll b/test/CodeGen/WebAssembly/switch.ll
index 7f6f6efff7d60..3df5e7f9cf6fe 100644
--- a/test/CodeGen/WebAssembly/switch.ll
+++ b/test/CodeGen/WebAssembly/switch.ll
@@ -3,7 +3,7 @@
; Test switch instructions. Block placement is disabled because it reorders
; the blocks in a way that isn't interesting here.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare void @foo0()
@@ -14,27 +14,27 @@ declare void @foo4()
declare void @foo5()
; CHECK-LABEL: bar32:
-; CHECK: block BB0_8{{$}}
-; CHECK: block BB0_7{{$}}
-; CHECK: block BB0_6{{$}}
-; CHECK: block BB0_5{{$}}
-; CHECK: block BB0_4{{$}}
-; CHECK: block BB0_3{{$}}
-; CHECK: block BB0_2{{$}}
-; CHECK: tableswitch {{[^,]*}}, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_4, BB0_4, BB0_4, BB0_4, BB0_4, BB0_4, BB0_5, BB0_6, BB0_7{{$}}
-; CHECK: BB0_2:
-; CHECK: call foo0
-; CHECK: BB0_3:
-; CHECK: call foo1
-; CHECK: BB0_4:
-; CHECK: call foo2
-; CHECK: BB0_5:
-; CHECK: call foo3
-; CHECK: BB0_6:
-; CHECK: call foo4
-; CHECK: BB0_7:
-; CHECK: call foo5
-; CHECK: BB0_8:
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: tableswitch {{[^,]*}}, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5{{$}}
+; CHECK: .LBB0_2:
+; CHECK: call foo0@FUNCTION{{$}}
+; CHECK: .LBB0_3:
+; CHECK: call foo1@FUNCTION{{$}}
+; CHECK: .LBB0_4:
+; CHECK: call foo2@FUNCTION{{$}}
+; CHECK: .LBB0_5:
+; CHECK: call foo3@FUNCTION{{$}}
+; CHECK: .LBB0_6:
+; CHECK: call foo4@FUNCTION{{$}}
+; CHECK: .LBB0_7:
+; CHECK: call foo5@FUNCTION{{$}}
+; CHECK: .LBB0_8:
; CHECK: return{{$}}
define void @bar32(i32 %n) {
entry:
@@ -94,27 +94,27 @@ sw.epilog: ; preds = %entry, %sw.bb.5, %s
}
; CHECK-LABEL: bar64:
-; CHECK: block BB1_8{{$}}
-; CHECK: block BB1_7{{$}}
-; CHECK: block BB1_6{{$}}
-; CHECK: block BB1_5{{$}}
-; CHECK: block BB1_4{{$}}
-; CHECK: block BB1_3{{$}}
-; CHECK: block BB1_2{{$}}
-; CHECK: tableswitch {{[^,]*}}, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_4, BB1_4, BB1_4, BB1_4, BB1_4, BB1_4, BB1_5, BB1_6, BB1_7{{$}}
-; CHECK: BB1_2:
-; CHECK: call foo0
-; CHECK: BB1_3:
-; CHECK: call foo1
-; CHECK: BB1_4:
-; CHECK: call foo2
-; CHECK: BB1_5:
-; CHECK: call foo3
-; CHECK: BB1_6:
-; CHECK: call foo4
-; CHECK: BB1_7:
-; CHECK: call foo5
-; CHECK: BB1_8:
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: block{{$}}
+; CHECK: tableswitch {{[^,]*}}, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5{{$}}
+; CHECK: .LBB1_2:
+; CHECK: call foo0@FUNCTION{{$}}
+; CHECK: .LBB1_3:
+; CHECK: call foo1@FUNCTION{{$}}
+; CHECK: .LBB1_4:
+; CHECK: call foo2@FUNCTION{{$}}
+; CHECK: .LBB1_5:
+; CHECK: call foo3@FUNCTION{{$}}
+; CHECK: .LBB1_6:
+; CHECK: call foo4@FUNCTION{{$}}
+; CHECK: .LBB1_7:
+; CHECK: call foo5@FUNCTION{{$}}
+; CHECK: .LBB1_8:
; CHECK: return{{$}}
define void @bar64(i64 %n) {
entry:
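For reference, the IR driving bar32/bar64 is an ordinary switch; the runs of identical tableswitch operands above correspond to ranges of case values sharing a destination. A trimmed sketch, with the case list abbreviated and a hypothetical name:

declare void @sketch_callee()
define void @bar32_sketch(i32 %n) {
entry:
  switch i32 %n, label %sw.epilog [
    i32 0, label %sw.bb
    i32 1, label %sw.bb
  ]
sw.bb:
  tail call void @sketch_callee()
  br label %sw.epilog
sw.epilog:
  ret void
}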
diff --git a/test/CodeGen/WebAssembly/unreachable.ll b/test/CodeGen/WebAssembly/unreachable.ll
index 414767e5c35df..7b23bf3cecfb6 100644
--- a/test/CodeGen/WebAssembly/unreachable.ll
+++ b/test/CodeGen/WebAssembly/unreachable.ll
@@ -4,7 +4,7 @@
; Test that the LLVM unreachable instruction and the trap intrinsic are
; lowered to wasm unreachable.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
declare void @llvm.trap()
@@ -12,7 +12,7 @@ declare void @llvm.debugtrap()
declare void @abort()
; CHECK-LABEL: f1:
-; CHECK: call abort
+; CHECK: call abort@FUNCTION{{$}}
; CHECK: unreachable
define i32 @f1() {
call void @abort()
diff --git a/test/CodeGen/WebAssembly/unused-argument.ll b/test/CodeGen/WebAssembly/unused-argument.ll
index e7851b216cb47..00dea769ee866 100644
--- a/test/CodeGen/WebAssembly/unused-argument.ll
+++ b/test/CodeGen/WebAssembly/unused-argument.ll
@@ -2,7 +2,7 @@
; Make sure that argument offsets are correct even if some arguments are unused.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: unused_first:
@@ -22,7 +22,7 @@ define i32 @unused_second(i32 %x, i32 %y) {
}
; CHECK-LABEL: call_something:
-; CHECK-NEXT: {{^}} i32.call $discard=, return_something{{$}}
+; CHECK-NEXT: {{^}} i32.call $discard=, return_something@FUNCTION{{$}}
; CHECK-NEXT: return{{$}}
declare i32 @return_something()
define void @call_something() {
diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll
index 6e01e36cf9fa4..cc50192b66dbd 100644
--- a/test/CodeGen/WebAssembly/userstack.ll
+++ b/test/CodeGen/WebAssembly/userstack.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -asm-verbose=false -fast-isel | FileCheck %s
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: alloca32:
@@ -72,6 +72,27 @@ define void @allocarray() {
ret void
}
+define void @allocarray_inbounds() {
+ ; CHECK: i32.const [[L1:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.load [[L1]]=, 0([[L1]])
+ ; CHECK-NEXT: i32.const [[L2:.+]]=, 32
+ ; CHECK-NEXT: i32.sub [[SP:.+]]=, [[L1]], [[L2]]
+ %r = alloca [5 x i32]
+ ; CHECK: i32.const $push[[L3:.+]]=, 1
+ ; CHECK: i32.store {{.*}}=, 12([[SP]]), $pop[[L3]]
+ %p = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 0
+ store i32 1, i32* %p
+ ; This store should have both the GEP and the FI folded into it.
+ ; CHECK-NEXT: i32.store {{.*}}=, 16([[SP]]), $pop
+ %p2 = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 1
+ store i32 1, i32* %p2
+ ; CHECK: i32.const [[L7:.+]]=, 32
+ ; CHECK-NEXT: i32.add [[SP]]=, [[SP]], [[L7]]
+ ; CHECK-NEXT: i32.const [[L8:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.store [[SP]]=, 0([[L8]]), [[SP]]
+ ret void
+}
+
define void @dynamic_alloca(i32 %alloc) {
; TODO: Support frame pointers
;%r = alloca i32, i32 %alloc
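The checks in allocarray_inbounds pin down the user-stack prologue/epilogue; in outline, with symbolic register names (a sketch, not verbatim llc output):

  i32.const $L=, __stack_pointer    # address of the stack-pointer global
  i32.load  $SP=, 0($L)             # read the current SP
  i32.const $N=, 32                 # frame size
  i32.sub   $SP=, $SP, $N           # allocate the frame
  ...                               # body: stores at 12($SP), 16($SP), ...
  i32.add   $SP=, $SP, $N           # free the frame
  i32.const $L=, __stack_pointer
  i32.store $discard=, 0($L), $SP   # write the restored SP back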
diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll
index c564d94207424..c12264625c374 100644
--- a/test/CodeGen/WebAssembly/varargs.ll
+++ b/test/CodeGen/WebAssembly/varargs.ll
@@ -2,7 +2,7 @@
; Test varargs constructs.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; Test va_start.
@@ -103,7 +103,7 @@ entry:
declare void @callee(...)
; CHECK-LABEL: caller_none:
-; CHECK-NEXT: call callee{{$}}
+; CHECK-NEXT: call callee@FUNCTION{{$}}
; CHECK-NEXT: return{{$}}
define void @caller_none() {
call void (...) @callee()
diff --git a/test/CodeGen/WebAssembly/vtable.ll b/test/CodeGen/WebAssembly/vtable.ll
index 38298bc474b53..739ba2aaf5a58 100644
--- a/test/CodeGen/WebAssembly/vtable.ll
+++ b/test/CodeGen/WebAssembly/vtable.ll
@@ -11,7 +11,7 @@
; struct D : public B;
; Each with a virtual dtor and method foo.
-target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
%struct.A = type { i32 (...)** }
diff --git a/test/CodeGen/WinEH/wineh-cloning.ll b/test/CodeGen/WinEH/wineh-cloning.ll
index 3c1793a3bd7f8..748c07df17300 100644
--- a/test/CodeGen/WinEH/wineh-cloning.ll
+++ b/test/CodeGen/WinEH/wineh-cloning.ll
@@ -233,48 +233,6 @@ exit:
; CHECK-NEXT: br label %outer.ret
-define void @test9() personality i32 (...)* @__C_specific_handler {
-entry:
- invoke void @f()
- to label %invoke.cont unwind label %left
-invoke.cont:
- invoke void @f()
- to label %unreachable unwind label %right
-left:
- %cp.left = cleanuppad within none []
- call void @llvm.foo(i32 1)
- invoke void @f() [ "funclet"(token %cp.left) ]
- to label %unreachable unwind label %right
-right:
- %cp.right = cleanuppad within none []
- call void @llvm.foo(i32 2)
- invoke void @f() [ "funclet"(token %cp.right) ]
- to label %unreachable unwind label %left
-unreachable:
- unreachable
-}
-; This is an irreducible loop with two funclets that enter each other.
-; CHECK-LABEL: define void @test9(
-; CHECK: entry:
-; CHECK: to label %invoke.cont unwind label %[[LEFT:.+]]
-; CHECK: invoke.cont:
-; CHECK: to label %[[UNREACHABLE_ENTRY:.+]] unwind label %[[RIGHT:.+]]
-; CHECK: [[LEFT]]:
-; CHECK: call void @llvm.foo(i32 1)
-; CHECK: invoke void @f()
-; CHECK: to label %[[UNREACHABLE_LEFT:.+]] unwind label %[[RIGHT]]
-; CHECK: [[RIGHT]]:
-; CHECK: call void @llvm.foo(i32 2)
-; CHECK: invoke void @f()
-; CHECK: to label %[[UNREACHABLE_RIGHT:.+]] unwind label %[[LEFT]]
-; CHECK: [[UNREACHABLE_RIGHT]]:
-; CHECK: unreachable
-; CHECK: [[UNREACHABLE_LEFT]]:
-; CHECK: unreachable
-; CHECK: [[UNREACHABLE_ENTRY]]:
-; CHECK: unreachable
-
-
define void @test10() personality i32 (...)* @__CxxFrameHandler3 {
entry:
invoke void @f()
diff --git a/test/CodeGen/WinEH/wineh-no-demotion.ll b/test/CodeGen/WinEH/wineh-no-demotion.ll
index 4fb84db890930..0901e27c301d0 100644
--- a/test/CodeGen/WinEH/wineh-no-demotion.ll
+++ b/test/CodeGen/WinEH/wineh-no-demotion.ll
@@ -33,7 +33,7 @@ right:
shared:
%x = call i32 @g()
- invoke void @f() [ "funclet"(token %0) ]
+ invoke void @f()
to label %shared.cont unwind label %inner
shared.cont:
@@ -72,7 +72,7 @@ right:
shared:
%x = call i32 @g()
- invoke void @f() [ "funclet"(token %0) ]
+ invoke void @f()
to label %shared.cont unwind label %inner
shared.cont:
diff --git a/test/CodeGen/WinEH/wineh-statenumbering.ll b/test/CodeGen/WinEH/wineh-statenumbering.ll
index dab7fde61a667..4e7c36943a016 100644
--- a/test/CodeGen/WinEH/wineh-statenumbering.ll
+++ b/test/CodeGen/WinEH/wineh-statenumbering.ll
@@ -44,7 +44,7 @@ catch: ; preds = %catch.dispatch
; CHECK: catch:
; CHECK: store i32 2
; CHECK: invoke void @_CxxThrowException(
- invoke void @_CxxThrowException(i8* null, %eh.ThrowInfo* null) #1
+ invoke void @_CxxThrowException(i8* null, %eh.ThrowInfo* null) [ "funclet"(token %1) ]
to label %unreachable unwind label %catch.dispatch.1
catch.dispatch.1: ; preds = %catch
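The "funclet" bundle being attached here is what ties a potentially-throwing call to the EH funclet it executes in; schematically, with context trimmed to the shape typical of __CxxFrameHandler3 tests:

  %1 = catchpad within %0 [i8* null, i32 64, i8* null]
  ; While funclet %1 is active, calls that can throw must carry its token so
  ; state numbering can assign them the funclet's EH state.
  invoke void @_CxxThrowException(i8* null, %eh.ThrowInfo* null) [ "funclet"(token %1) ]
          to label %unreachable unwind label %catch.dispatch.1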
diff --git a/test/CodeGen/X86/2008-11-03-F80VAARG.ll b/test/CodeGen/X86/2008-11-03-F80VAARG.ll
index 507799b7304fb..97c046c864261 100644
--- a/test/CodeGen/X86/2008-11-03-F80VAARG.ll
+++ b/test/CodeGen/X86/2008-11-03-F80VAARG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -o - | not grep 10
+; RUN: llc < %s -march=x86 -o - | FileCheck %s
declare void @llvm.va_start(i8*) nounwind
@@ -6,6 +6,8 @@ declare void @llvm.va_copy(i8*, i8*) nounwind
declare void @llvm.va_end(i8*) nounwind
+; CHECK-LABEL: test:
+; CHECK-NOT: 10
define x86_fp80 @test(...) nounwind {
%ap = alloca i8* ; <i8**> [#uses=3]
%v1 = bitcast i8** %ap to i8* ; <i8*> [#uses=1]
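The RUN-line change above trades a whole-output `not grep 10` for label-scoped directives: CHECK-NOT only rejects a stray 10 between its CHECK-LABEL and the next match, rather than anywhere in the file, where an unrelated register number or offset could trip it. The idiom generalizes as (sketch):

; CHECK-LABEL: test:
; CHECK-NOT: 10
; CHECK: ret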
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 92ec107a00794..6950641a08aea 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -3,9 +3,7 @@
define void @endless_loop() {
; CHECK-LABEL: endless_loop:
; CHECK-NEXT: # BB#0:
-; CHECK-NEXT: vmovaps (%eax), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT: vbroadcastss (%eax), %ymm0
; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 86b0628aa0bc6..0c92f4884fb73 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -1,11 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: A:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: A:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: A:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
@@ -16,10 +29,16 @@ entry:
}
define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: B:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: B:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: B:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
@@ -30,10 +49,16 @@ entry:
}
define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: C:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: C:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: C:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 8
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
@@ -44,10 +69,16 @@ entry:
}
define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: D:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: D:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: D:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
@@ -60,10 +91,16 @@ entry:
;;;; 128-bit versions
define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: e:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: e:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: e:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@@ -75,10 +112,15 @@ entry:
; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _e2:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
-; CHECK-NEXT: retq
+; X32-LABEL: _e2:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
+; X32-NEXT: retl
+;
+; X64-LABEL: _e2:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
+; X64-NEXT: retq
entry:
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
@@ -89,10 +131,16 @@ entry:
define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: F:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: F:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: F:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
@@ -105,10 +153,16 @@ entry:
; FIXME: Pointer adjusted broadcasts
define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4i32_4i32_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1]
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4i32_4i32_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4i32_4i32_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1]
+; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -116,11 +170,18 @@ entry:
}
define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8i32_4i32_33333333:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8i32_4i32_33333333:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8i32_4i32_33333333:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -128,13 +189,16 @@ entry:
}
define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8i32_8i32_55555555:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8i32_8i32_55555555:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 20(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8i32_8i32_55555555:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <8 x i32>, <8 x i32>* %ptr
%ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -142,10 +206,16 @@ entry:
}
define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4f32_4f32_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 4(%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4f32_4f32_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 4(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4f32_4f32_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
+; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -153,10 +223,16 @@ entry:
}
define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8f32_4f32_33333333:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 12(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8f32_4f32_33333333:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 12(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8f32_4f32_33333333:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -164,10 +240,16 @@ entry:
}
define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8f32_8f32_55555555:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 20(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8f32_8f32_55555555:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 20(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8f32_8f32_55555555:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <8 x float>, <8 x float>* %ptr
%ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -175,10 +257,16 @@ entry:
}
define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_2i64_2i64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_2i64_2i64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_2i64_2i64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
+; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
@@ -186,12 +274,20 @@ entry:
}
define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4i64_2i64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4i64_2i64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4i64_2i64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -199,13 +295,16 @@ entry:
}
define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4i64_4i64_2222:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4i64_4i64_2222:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4i64_4i64_2222:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x i64>, <4 x i64>* %ptr
%ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
@@ -213,11 +312,18 @@ entry:
}
define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_2f64_2f64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_2f64_2f64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_2f64_2f64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -225,10 +331,16 @@ entry:
}
define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4f64_2f64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd 8(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4f64_2f64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4f64_2f64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -236,10 +348,16 @@ entry:
}
define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4f64_4f64_2222:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4f64_4f64_2222:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4f64_4f64_2222:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x double>, <4 x double>* %ptr
%ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
@@ -249,11 +367,22 @@ entry:
; Unsupported vbroadcasts
define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: G:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: retq
+; X32-LABEL: G:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: G:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
@@ -262,20 +391,31 @@ entry:
}
define <4 x i32> @H(<4 x i32> %a) {
-; CHECK-LABEL: H:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-NEXT: retq
+; X32-LABEL: H:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: H:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: retq
entry:
%x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
ret <4 x i32> %x
}
define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: I:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: retq
+; X32-LABEL: I:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: I:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 4
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
@@ -284,12 +424,21 @@ entry:
}
define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _RR:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: movl (%rsi), %eax
-; CHECK-NEXT: movl %eax, (%rax)
-; CHECK-NEXT: retq
+; X32-LABEL: _RR:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vbroadcastss (%ecx), %xmm0
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: movl %eax, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: _RR:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-NEXT: movl (%rsi), %eax
+; X64-NEXT: movl %eax, (%rax)
+; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@@ -303,10 +452,16 @@ entry:
}
define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _RR2:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _RR2:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _RR2:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%q = load float, float* %ptr, align 4
%v = insertelement <4 x float> undef, float %q, i32 0
@@ -319,10 +474,16 @@ entry:
; (via the insertelements).
define <8 x float> @splat_concat1(float* %p) {
-; CHECK-LABEL: splat_concat1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat1:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat1:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
+; X64-NEXT: retq
%1 = load float, float* %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
@@ -333,10 +494,16 @@ define <8 x float> @splat_concat1(float* %p) {
}
define <8 x float> @splat_concat2(float* %p) {
-; CHECK-LABEL: splat_concat2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat2:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat2:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
+; X64-NEXT: retq
%1 = load float, float* %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
@@ -351,10 +518,16 @@ define <8 x float> @splat_concat2(float* %p) {
}
define <4 x double> @splat_concat3(double* %p) {
-; CHECK-LABEL: splat_concat3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat3:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat3:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
%1 = load double, double* %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
@@ -363,10 +536,16 @@ define <4 x double> @splat_concat3(double* %p) {
}
define <4 x double> @splat_concat4(double* %p) {
-; CHECK-LABEL: splat_concat4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat4:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat4:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
%1 = load double, double* %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 6b77edb155a41..8fd50ae3015df 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1,11 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
define <16 x i8> @BB16(i8* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: BB16:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: BB16:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastb (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: BB16:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastb (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%q = load i8, i8* %ptr, align 4
%q0 = insertelement <16 x i8> undef, i8 %q, i32 0
@@ -28,10 +35,16 @@ entry:
}
define <32 x i8> @BB32(i8* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: BB32:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: BB32:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastb (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: BB32:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastb (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load i8, i8* %ptr, align 4
%q0 = insertelement <32 x i8> undef, i8 %q, i32 0
@@ -71,10 +84,16 @@ entry:
}
define <8 x i16> @W16(i16* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: W16:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: W16:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastw (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: W16:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastw (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%q = load i16, i16* %ptr, align 4
%q0 = insertelement <8 x i16> undef, i16 %q, i32 0
@@ -89,10 +108,16 @@ entry:
}
define <16 x i16> @WW16(i16* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: WW16:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: WW16:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastw (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: WW16:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastw (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load i16, i16* %ptr, align 4
%q0 = insertelement <16 x i16> undef, i16 %q, i32 0
@@ -115,10 +140,16 @@ entry:
}
define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: D32:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: D32:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: D32:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%q0 = insertelement <4 x i32> undef, i32 %q, i32 0
@@ -129,10 +160,16 @@ entry:
}
define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: DD32:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: DD32:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: DD32:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load i32, i32* %ptr, align 4
%q0 = insertelement <8 x i32> undef, i32 %q, i32 0
@@ -147,10 +184,21 @@ entry:
}
define <2 x i64> @Q64(i64* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: Q64:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: Q64:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: Q64:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastq (%rdi), %xmm0
+; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 4
%q0 = insertelement <2 x i64> undef, i64 %q, i32 0
@@ -159,10 +207,22 @@ entry:
}
define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: QQ64:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: QQ64:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: QQ64:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 4
%q0 = insertelement <4 x i64> undef, i64 %q, i32 0
@@ -175,10 +235,16 @@ entry:
; FIXME: Pointer adjusted broadcasts
define <16 x i8> @load_splat_16i8_16i8_1111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_16i8_16i8_1111111111111111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastb 1(%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_16i8_16i8_1111111111111111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastb 1(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_16i8_16i8_1111111111111111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastb 1(%rdi), %xmm0
+; X64-NEXT: retq
entry:
%ld = load <16 x i8>, <16 x i8>* %ptr
%ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -186,10 +252,16 @@ entry:
}
define <32 x i8> @load_splat_32i8_16i8_11111111111111111111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastb 1(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastb 1(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastb 1(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <16 x i8>, <16 x i8>* %ptr
%ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -197,10 +269,16 @@ entry:
}
define <32 x i8> @load_splat_32i8_32i8_11111111111111111111111111111111(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastb 1(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastb 1(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastb 1(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <32 x i8>, <32 x i8>* %ptr
%ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -208,10 +286,16 @@ entry:
}
define <8 x i16> @load_splat_8i16_8i16_11111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8i16_8i16_11111111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8i16_8i16_11111111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastw 2(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8i16_8i16_11111111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastw 2(%rdi), %xmm0
+; X64-NEXT: retq
entry:
%ld = load <8 x i16>, <8 x i16>* %ptr
%ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -219,10 +303,16 @@ entry:
}
define <16 x i16> @load_splat_16i16_8i16_1111111111111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_16i16_8i16_1111111111111111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastw 2(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_16i16_8i16_1111111111111111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastw 2(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_16i16_8i16_1111111111111111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastw 2(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <8 x i16>, <8 x i16>* %ptr
%ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -230,10 +320,16 @@ entry:
}
define <16 x i16> @load_splat_16i16_16i16_1111111111111111(<16 x i16>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_16i16_16i16_1111111111111111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastw 2(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_16i16_16i16_1111111111111111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpbroadcastw 2(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_16i16_16i16_1111111111111111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastw 2(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <16 x i16>, <16 x i16>* %ptr
%ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -241,10 +337,16 @@ entry:
}
define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4i32_4i32_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 4(%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4i32_4i32_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 4(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4i32_4i32_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
+; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -252,10 +354,16 @@ entry:
}
define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8i32_4i32_33333333:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 12(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8i32_4i32_33333333:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 12(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8i32_4i32_33333333:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -263,10 +371,16 @@ entry:
}
define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8i32_8i32_55555555:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 20(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8i32_8i32_55555555:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 20(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8i32_8i32_55555555:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <8 x i32>, <8 x i32>* %ptr
%ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -274,10 +388,16 @@ entry:
}
define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4f32_4f32_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 4(%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4f32_4f32_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 4(%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4f32_4f32_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
+; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -285,10 +405,16 @@ entry:
}
define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8f32_4f32_33333333:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 12(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8f32_4f32_33333333:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 12(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8f32_4f32_33333333:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -296,10 +422,16 @@ entry:
}
define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_8f32_8f32_55555555:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss 20(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_8f32_8f32_55555555:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss 20(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_8f32_8f32_55555555:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <8 x float>, <8 x float>* %ptr
%ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -307,10 +439,17 @@ entry:
}
define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_2i64_2i64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_2i64_2i64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_2i64_2i64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastq 8(%rdi), %xmm0
+; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
@@ -318,10 +457,16 @@ entry:
}
define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4i64_2i64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd 8(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4i64_2i64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4i64_2i64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -329,10 +474,16 @@ entry:
}
define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4i64_4i64_2222:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4i64_4i64_2222:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4i64_4i64_2222:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x i64>, <4 x i64>* %ptr
%ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
@@ -340,11 +491,18 @@ entry:
}
define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_2f64_2f64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_2f64_2f64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_2f64_2f64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -352,10 +510,16 @@ entry:
}
define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4f64_2f64_1111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd 8(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4f64_2f64_1111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4f64_2f64_1111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -363,10 +527,16 @@ entry:
}
define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: load_splat_4f64_4f64_2222:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: load_splat_4f64_4f64_2222:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: load_splat_4f64_4f64_2222:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
+; X64-NEXT: retq
entry:
%ld = load <4 x double>, <4 x double>* %ptr
%ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
@@ -376,10 +546,16 @@ entry:
; make sure that we still don't support broadcasting a double into a 128-bit vector;
; this used to crash
define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: I:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; CHECK-NEXT: retq
+; X32-LABEL: I:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: I:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT: retq
entry:
%q = load double, double* %ptr, align 4
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
@@ -388,32 +564,49 @@ entry:
}
define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp {
-; CHECK-LABEL: V111:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: V111:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: vpbroadcastd LCPI27_0, %ymm1
+; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: V111:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
entry:
%g = add <8 x i32> %in, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %g
}
define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
-; CHECK-LABEL: V113:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: V113:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: vbroadcastss LCPI28_0, %ymm1
+; X32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: V113:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
entry:
%g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
ret <8 x float> %g
}
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _e2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _e2:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss LCPI29_0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _e2:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
@@ -422,10 +615,15 @@ define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
}
define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _e4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
-; CHECK-NEXT: retq
+; X32-LABEL: _e4:
+; X32: ## BB#0:
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
+; X32-NEXT: retl
+;
+; X64-LABEL: _e4:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
+; X64-NEXT: retq
%vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
%vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
@@ -437,19 +635,30 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
ret <8 x i8> %vecinit7.i
}
-
define void @crash() nounwind alwaysinline {
-; CHECK-LABEL: crash:
-; CHECK: ## BB#0: ## %WGLoopsEntry
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: je LBB31_1
-; CHECK-NEXT: ## BB#2: ## %ret
-; CHECK-NEXT: retq
-; CHECK-NEXT: .align 4, 0x90
-; CHECK-NEXT: LBB31_1: ## %footer349VF
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: jmp LBB31_1
+; X32-LABEL: crash:
+; X32: ## BB#0: ## %WGLoopsEntry
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: testb %al, %al
+; X32-NEXT: je LBB31_1
+; X32-NEXT: ## BB#2: ## %ret
+; X32-NEXT: retl
+; X32-NEXT: .align 4, 0x90
+; X32-NEXT: LBB31_1: ## %footer349VF
+; X32-NEXT: ## =>This Inner Loop Header: Depth=1
+; X32-NEXT: jmp LBB31_1
+;
+; X64-LABEL: crash:
+; X64: ## BB#0: ## %WGLoopsEntry
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb %al, %al
+; X64-NEXT: je LBB31_1
+; X64-NEXT: ## BB#2: ## %ret
+; X64-NEXT: retq
+; X64-NEXT: .align 4, 0x90
+; X64-NEXT: LBB31_1: ## %footer349VF
+; X64-NEXT: ## =>This Inner Loop Header: Depth=1
+; X64-NEXT: jmp LBB31_1
WGLoopsEntry:
br i1 undef, label %ret, label %footer329VF
@@ -477,150 +686,230 @@ ret:
}
define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _inreg0:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovd %edi, %xmm0
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg0:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg0:
+; X64: ## BB#0:
+; X64-NEXT: vmovd %edi, %xmm0
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
%in = insertelement <8 x i32> undef, i32 %scalar, i32 0
%wide = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
ret <8 x i32> %wide
}
define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _inreg1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg1:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg1:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
%in = insertelement <8 x float> undef, float %scalar, i32 0
%wide = shufflevector <8 x float> %in, <8 x float> undef, <8 x i32> zeroinitializer
ret <8 x float> %wide
}
define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _inreg2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg2:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg2:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
%in = insertelement <4 x float> undef, float %scalar, i32 0
%wide = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %wide
}
define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
-; CHECK-LABEL: _inreg3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg3:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg3:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
%in = insertelement <4 x double> undef, double %scalar, i32 0
%wide = shufflevector <4 x double> %in, <4 x double> undef, <4 x i32> zeroinitializer
ret <4 x double> %wide
}
define <8 x float> @_inreg8xfloat(<8 x float> %a) {
-; CHECK-LABEL: _inreg8xfloat:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg8xfloat:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg8xfloat:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
%b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
ret <8 x float> %b
}
define <4 x float> @_inreg4xfloat(<4 x float> %a) {
-; CHECK-LABEL: _inreg4xfloat:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg4xfloat:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg4xfloat:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
%b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %b
}
define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
-; CHECK-LABEL: _inreg16xi16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg16xi16:
+; X32: ## BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg16xi16:
+; X64: ## BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %ymm0
+; X64-NEXT: retq
%b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
ret <16 x i16> %b
}
define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
-; CHECK-LABEL: _inreg8xi16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg8xi16:
+; X32: ## BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg8xi16:
+; X64: ## BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-NEXT: retq
%b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %b
}
define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
-; CHECK-LABEL: _inreg4xi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg4xi64:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg4xi64:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
ret <4 x i64> %b
}
define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
-; CHECK-LABEL: _inreg2xi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg2xi64:
+; X32: ## BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg2xi64:
+; X64: ## BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %xmm0
+; X64-NEXT: retq
%b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %b
}
define <4 x double> @_inreg4xdouble(<4 x double> %a) {
-; CHECK-LABEL: _inreg4xdouble:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg4xdouble:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg4xdouble:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
%b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
ret <4 x double> %b
}
define <2 x double> @_inreg2xdouble(<2 x double> %a) {
-; CHECK-LABEL: _inreg2xdouble:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg2xdouble:
+; X32: ## BB#0:
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg2xdouble:
+; X64: ## BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
%b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
ret <2 x double> %b
}
define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
-; CHECK-LABEL: _inreg8xi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg8xi32:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg8xi32:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
%b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
ret <8 x i32> %b
}
define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
-; CHECK-LABEL: _inreg4xi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg4xi32:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg4xi32:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
%b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %b
}
define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
-; CHECK-LABEL: _inreg32xi8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg32xi8:
+; X32: ## BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg32xi8:
+; X64: ## BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %ymm0
+; X64-NEXT: retq
%b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
ret <32 x i8> %b
}
define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
-; CHECK-LABEL: _inreg16xi8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: _inreg16xi8:
+; X32: ## BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: _inreg16xi8:
+; X64: ## BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-NEXT: retq
%b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %b
}
@@ -630,10 +919,15 @@ define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
; (via the insertelements).
define <8 x float> @splat_concat1(float %f) {
-; CHECK-LABEL: splat_concat1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat1:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat1:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
%1 = insertelement <4 x float> undef, float %f, i32 0
%2 = insertelement <4 x float> %1, float %f, i32 1
%3 = insertelement <4 x float> %2, float %f, i32 2
@@ -643,10 +937,15 @@ define <8 x float> @splat_concat1(float %f) {
}
define <8 x float> @splat_concat2(float %f) {
-; CHECK-LABEL: splat_concat2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat2:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat2:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
%1 = insertelement <4 x float> undef, float %f, i32 0
%2 = insertelement <4 x float> %1, float %f, i32 1
%3 = insertelement <4 x float> %2, float %f, i32 2
@@ -660,10 +959,15 @@ define <8 x float> @splat_concat2(float %f) {
}
define <4 x double> @splat_concat3(double %d) {
-; CHECK-LABEL: splat_concat3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat3:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat3:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
%1 = insertelement <2 x double> undef, double %d, i32 0
%2 = insertelement <2 x double> %1, double %d, i32 1
%3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -671,10 +975,15 @@ define <4 x double> @splat_concat3(double %d) {
}
define <4 x double> @splat_concat4(double %d) {
-; CHECK-LABEL: splat_concat4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: retq
+; X32-LABEL: splat_concat4:
+; X32: ## BB#0:
+; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: splat_concat4:
+; X64: ## BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
%1 = insertelement <2 x double> undef, double %d, i32 0
%2 = insertelement <2 x double> %1, double %d, i32 1
%3 = insertelement <2 x double> undef, double %d, i32 0
@@ -805,9 +1114,9 @@ eintry:
ret void
}
-; CHECK-LABEL: isel_crash_2q
-; CHECK: vpbroadcastq {{[^,]+}}, %xmm{{[0-9]+}}
-; CHECK: ret
+; X64-LABEL: isel_crash_2q
+; X64: vpbroadcastq {{[^,]+}}, %xmm{{[0-9]+}}
+; X64: ret
define void @isel_crash_2q(i64* %cV_R.addr) {
entry:
%__a.addr.i = alloca <2 x i64>, align 16
@@ -823,9 +1132,9 @@ entry:
ret void
}
-; CHECK-LABEL: isel_crash_4q
-; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
-; CHECK: ret
+; X64-LABEL: isel_crash_4q
+; X64: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
+; X64: ret
define void @isel_crash_4q(i64* %cV_R.addr) {
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 5a17cdb292168..7179f742cc66e 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -907,49 +907,79 @@ define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask)
declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
-define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_maskz_load_aligned_ps:
+define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
- ret <16 x float> %res
+ %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
}
declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
-define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_maskz_load_aligned_pd:
+define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovups (%rdi), %zmm0
+; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
- ret <8 x double> %res
+ %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
}
-declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
-define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_load_aligned_ps:
+define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
- ret <16 x float> %res
+ %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
}
-define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_load_aligned_pd:
+declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
+
+define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovupd (%rdi), %zmm0
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
- ret <8 x double> %res
+ %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
}
-declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_valign_q:
@@ -5731,7 +5761,7 @@ define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vgetmantpd $11,{sae}, %zmm0, %zmm0
+; CHECK-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
@@ -5747,7 +5777,7 @@ define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <1
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vgetmantps $11,{sae}, %zmm0, %zmm0
+; CHECK-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
@@ -5767,7 +5797,7 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5
-; CHECK-NEXT: vgetmantsd $11,{sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
@@ -5792,7 +5822,7 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vgetmantss $11,{sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1
; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -6542,3 +6572,316 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i8 %x1, <8
ret <8 x i64> %res4
}
+declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i16, <16 x i32>, i8)
+
+define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i16 %x1, <16 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpshufd $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpshufd $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i8, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovzxbd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbd %xmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovzxbq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovzxdq %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxdq %ymm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovzxwd %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxwd %ymm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovzxwq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxwq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 064652aa470d4..7cf6edafbcc82 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2910,15 +2910,15 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i8 %x1, <
declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psrlv32hi:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psrlv32hi:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
%res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -2930,15 +2930,15 @@ define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16>
declare <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -2950,15 +2950,15 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16>
declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i8, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsraw $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpsraw $3, %zmm0, %zmm0
-; CHECK-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_wi_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
@@ -2966,3 +2966,164 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i8, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i8, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_wi_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psllv32hi:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+
+declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 6b2cb432f1cdc..4cbb9ba6c56ac 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -4714,3 +4714,424 @@ define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i8 %x1, <
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i16, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i16 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0
+; CHECK-NEXT: ## xmm0 = xmm0[3,0,0,0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i16, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i16 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0
+; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i8, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0
+; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i8, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i8, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0
+; CHECK-NEXT: ## xmm0 = xmm0[3,0,0,0,4,5,6,7]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i8, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0
+; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
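+; Variable per-element arithmetic right shifts on words (vpsravw).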
+declare <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav16_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
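+; Word shift-left tests: vpsllw takes a uniform count from an XMM register
+; or an immediate (psll.wi), while vpsllvw shifts each element independently.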
+declare <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16>, <8 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psllv16_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
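+; Byte-to-word zero- and sign-extension tests at 128- and 256-bit widths.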
+declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 8ab34bd8c436e..a4f3e666833ae 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -6387,3 +6387,1036 @@ define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
+
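+; Masked load tests: an unmasked load first fills the result, a merge-masked
+; reload blends new elements under %k1, and a zero-masked load clears the
+; inactive lanes.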
+define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x float> %res2, %res1
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
+
+define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovups (%rdi), %ymm0
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x float> %res2, %res1
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
+
+define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x double> %res2, %res1
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
+
+define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovupd (%rdi), %ymm0
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x double> %res2, %res1
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
+
+define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovaps (%rdi), %xmm0
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x float> %res2, %res1
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
+
+define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovups (%rdi), %xmm0
+; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x float> %res2, %res1
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
+
+define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovapd (%rdi), %xmm0
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <2 x double> %res2, %res1
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
+
+define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vmovupd (%rdi), %xmm0
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <2 x double> %res2, %res1
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8*, <2 x double>, i8)
+
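+; Variable per-element arithmetic right shifts on dwords and qwords
+; (vpsravd/vpsravq).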
+declare <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
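+; Variable per-element logical left shifts (vpsllvq/vpsllvd).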
+declare <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psllv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psllv2_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psllv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psllv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
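+; Variable per-element rotate-right tests (vprorvd/vprorvq).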
+declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
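+; Rotate-left-by-immediate tests (vprold/vprolq with a count of 3).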
+declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
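+; Rotate-right-by-immediate tests (vprord/vprorq with a count of 3).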
+declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
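+; Zero- and sign-extension tests (vpmovzx*/vpmovsx*) from bytes, words, and
+; dwords to wider elements.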
+declare <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
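+; The sign-extension (pmovsx*) intrinsics below follow the same three-call
+; pattern as the zero-extension tests above.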
+declare <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
diff --git a/test/CodeGen/X86/catchpad-lifetime.ll b/test/CodeGen/X86/catchpad-lifetime.ll
new file mode 100644
index 0000000000000..dfd75334561fa
--- /dev/null
+++ b/test/CodeGen/X86/catchpad-lifetime.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "x86_64-pc-windows-msvc"
+
+declare void @throw()
+
+declare i32 @__CxxFrameHandler3(...)
+
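+; test1 passes %alloca1 to its catchpad as the catch object while test2 passes
+; null; the lifetime markers inside the catchpad must not disturb the catch
+; object's frame slot, which shows up as CatchObjOffset 32 vs. 0 below.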
+define void @test1() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %alloca2 = alloca i8*, align 4
+ %alloca1 = alloca i8*, align 4
+ store volatile i8* null, i8** %alloca1
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+; CHECK-LABEL: test1:
+; CHECK: movq $0, -16(%rbp)
+; CHECK: callq throw
+
+catch.dispatch: ; preds = %entry
+ %cs = catchswitch within none [label %catch.pad] unwind to caller
+
+catch.pad: ; preds = %catch.dispatch
+ %cp = catchpad within %cs [i8* null, i32 0, i8** %alloca1]
+ store volatile i8* null, i8** %alloca1
+ %bc1 = bitcast i8** %alloca1 to i8*
+ call void @llvm.lifetime.end(i64 4, i8* nonnull %bc1)
+ %bc2 = bitcast i8** %alloca2 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %bc2)
+ store volatile i8* null, i8** %alloca1
+ unreachable
+
+; CHECK-LABEL: "?catch$2@?0?test1@4HA"
+; CHECK: movq $0, -16(%rbp)
+; CHECK: movq $0, -16(%rbp)
+; CHECK: ud2
+
+unreachable: ; preds = %entry
+ unreachable
+}
+
+; CHECK-LABEL: $cppxdata$test1:
+; CHECK: .long 32 # CatchObjOffset
+
+define void @test2() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %alloca2 = alloca i8*, align 4
+ %alloca1 = alloca i8*, align 4
+ store volatile i8* null, i8** %alloca1
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+; CHECK-LABEL: test2:
+; CHECK: movq $0, -16(%rbp)
+; CHECK: callq throw
+
+catch.dispatch: ; preds = %entry
+ %cs = catchswitch within none [label %catch.pad] unwind to caller
+
+catch.pad: ; preds = %catch.dispatch
+ %cp = catchpad within %cs [i8* null, i32 0, i8** null]
+ store volatile i8* null, i8** %alloca1
+ %bc1 = bitcast i8** %alloca1 to i8*
+ call void @llvm.lifetime.end(i64 4, i8* nonnull %bc1)
+ %bc2 = bitcast i8** %alloca2 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %bc2)
+ store volatile i8* null, i8** %alloca1
+ unreachable
+
+; CHECK-LABEL: "?catch$2@?0?test2@4HA"
+; CHECK: movq $0, -16(%rbp)
+; CHECK: movq $0, -16(%rbp)
+; CHECK: ud2
+
+unreachable: ; preds = %entry
+ unreachable
+}
+
+; CHECK-LABEL: $cppxdata$test2:
+; CHECK: .long 0 # CatchObjOffset
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+attributes #0 = { argmemonly nounwind }
diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll
index c229521cc9a4b..70fe501040bf7 100644
--- a/test/CodeGen/X86/cxx_tlscc64.ll
+++ b/test/CodeGen/X86/cxx_tlscc64.ll
@@ -1,5 +1,9 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck --check-prefix=SHRINK %s
+; TLS functions were wrongly modeled and, after fixing that, shrink-wrapping
+; cannot help here. To achieve the expected lowering, we need to play tricks
+; similar to the AArch64 fast TLS calling convention (r255821); the same
+; tricks are applied to x86-64 here.
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck %s
%struct.S = type { i8 }
@sg = internal thread_local global %struct.S zeroinitializer, align 1
@@ -12,51 +16,28 @@ declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
; Every GPR should be saved - except rdi, rax, and rsp
; CHECK-LABEL: _ZTW2sg
-; CHECK: pushq %r11
-; CHECK: pushq %r10
-; CHECK: pushq %r9
-; CHECK: pushq %r8
-; CHECK: pushq %rsi
-; CHECK: pushq %rdx
-; CHECK: pushq %rcx
-; CHECK: pushq %rbx
+; CHECK-NOT: pushq %r11
+; CHECK-NOT: pushq %r10
+; CHECK-NOT: pushq %r9
+; CHECK-NOT: pushq %r8
+; CHECK-NOT: pushq %rsi
+; CHECK-NOT: pushq %rdx
+; CHECK-NOT: pushq %rcx
+; CHECK-NOT: pushq %rbx
; CHECK: callq
; CHECK: jne
; CHECK: callq
; CHECK: tlv_atexit
; CHECK: callq
-; CHECK: popq %rbx
-; CHECK: popq %rcx
-; CHECK: popq %rdx
-; CHECK: popq %rsi
-; CHECK: popq %r8
-; CHECK: popq %r9
-; CHECK: popq %r10
-; CHECK: popq %r11
-; SHRINK-LABEL: _ZTW2sg
-; SHRINK: callq
-; SHRINK: jne
-; SHRINK: pushq %r11
-; SHRINK: pushq %r10
-; SHRINK: pushq %r9
-; SHRINK: pushq %r8
-; SHRINK: pushq %rsi
-; SHRINK: pushq %rdx
-; SHRINK: pushq %rcx
-; SHRINK: pushq %rbx
-; SHRINK: callq
-; SHRINK: tlv_atexit
-; SHRINK: popq %rbx
-; SHRINK: popq %rcx
-; SHRINK: popq %rdx
-; SHRINK: popq %rsi
-; SHRINK: popq %r8
-; SHRINK: popq %r9
-; SHRINK: popq %r10
-; SHRINK: popq %r11
-; SHRINK: LBB{{.*}}:
-; SHRINK: callq
-define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() {
+; CHECK-NOT: popq %rbx
+; CHECK-NOT: popq %rcx
+; CHECK-NOT: popq %rdx
+; CHECK-NOT: popq %rsi
+; CHECK-NOT: popq %r8
+; CHECK-NOT: popq %r9
+; CHECK-NOT: popq %r10
+; CHECK-NOT: popq %r11
+define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
%.b.i = load i1, i1* @__tls_guard, align 1
br i1 %.b.i, label %__tls_init.exit, label %init.i
diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll
index be1dcff7ae857..bff0e64910bf3 100644
--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 14
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 13
define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind {
entry:
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 485592aeac385..a78022ac5505d 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -61,6 +61,18 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
ret <4 x float> %res
}
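+; The i64 load should fold straight into vcvtph2ps rather than being
+; materialized with a separate vmov.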
+define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
+; CHECK-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; CHECK-NOT: vmov
+; CHECK: vcvtph2ps (%
+
+ %load = load i64, i64* %ptr
+ %ins = insertelement <2 x i64> undef, i64 %load, i32 0
+ %bc = bitcast <2 x i64> %ins to <8 x i16>
+ %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc)
+ ret <4 x float> %res
+}
+
define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind {
entry:
; CHECK-LABEL: test_x86_vcvtps2ph_256_m:
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
index 655f8f49f8381..f2596b6347b90 100644
--- a/test/CodeGen/X86/insertps-combine.ll
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -109,3 +109,36 @@ define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
%vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
ret <4 x float> %vecinit4
}
+
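+; Bit 0 of the insertps zero mask (imm 21 = 0b00010101) zeroes lane 0, so
+; extracting lane 0 of the result folds to 0.0 and the whole function
+; reduces to xorps.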
+define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: extract_zero_insertps_z0z7:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extract_zero_insertps_z0z7:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
+; SSE-LABEL: extract_lane_insertps_5123:
+; SSE: # BB#0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extract_lane_insertps_5123:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+ %a1 = load <4 x float>, <4 x float> *%p1
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/lea-opt.ll b/test/CodeGen/X86/lea-opt.ll
index 571f2d9084c4d..8096bfabd6cfe 100644
--- a/test/CodeGen/X86/lea-opt.ll
+++ b/test/CodeGen/X86/lea-opt.ll
@@ -129,3 +129,41 @@ sw.epilog: ; preds = %sw.bb.2, %sw.bb.1,
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
}
+
+define void @test4(i64 %x) nounwind minsize {
+entry:
+ %a = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 0
+ %tmp = load i32, i32* %a, align 4
+ %b = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 1
+ %tmp1 = load i32, i32* %b, align 4
+ %sub = sub i32 %tmp, %tmp1
+ %c = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 2
+ %tmp2 = load i32, i32* %c, align 4
+ %add = add nsw i32 %sub, %tmp2
+ switch i32 %add, label %sw.epilog [
+ i32 1, label %sw.bb.1
+ i32 2, label %sw.bb.2
+ ]
+
+sw.bb.1: ; preds = %entry
+ store i32 111, i32* %b, align 4
+ store i32 222, i32* %c, align 4
+ br label %sw.epilog
+
+sw.bb.2: ; preds = %entry
+ store i32 333, i32* %b, align 4
+ store i32 444, i32* %c, align 4
+ br label %sw.epilog
+
+sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
+ ret void
+; CHECK-LABEL: test4:
+; CHECK: leaq arr1+4({{.*}}), [[REG2:%[a-z]+]]
+; CHECK: movl -4([[REG2]]), {{.*}}
+; CHECK: subl ([[REG2]]), {{.*}}
+; CHECK: addl 4([[REG2]]), {{.*}}
+; CHECK: movl ${{[1-4]+}}, ([[REG2]])
+; CHECK: movl ${{[1-4]+}}, 4([[REG2]])
+; CHECK: movl ${{[1-4]+}}, ([[REG2]])
+; CHECK: movl ${{[1-4]+}}, 4([[REG2]])
+}
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
index 2228fbbaa53b9..691d75b0e5016 100644
--- a/test/CodeGen/X86/pr13577.ll
+++ b/test/CodeGen/X86/pr13577.ll
@@ -18,3 +18,19 @@ define x86_fp80 @foo(x86_fp80 %a) {
}
declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
+
+; This would crash:
+; https://llvm.org/bugs/show_bug.cgi?id=26070
+
+define float @pr26070() {
+ %c = call float @copysignf(float 1.0, float undef) readnone
+ ret float %c
+
+; CHECK-LABEL: pr26070:
+; CHECK: andps
+; CHECK-NEXT: orps
+; CHECK-NEXT: retq
+}
+
+declare float @copysignf(float, float)
+
diff --git a/test/CodeGen/X86/scalar-int-to-fp.ll b/test/CodeGen/X86/scalar-int-to-fp.ll
index 93039859cdfb4..4a16c3198aa5f 100644
--- a/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -74,9 +74,16 @@ define x86_fp80 @s32_to_x(i32 %a) nounwind {
}
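+; On 32-bit targets the u64 is now staged through an XMM register into a
+; stack slot before the fildll.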
; CHECK-LABEL: u64_to_f
+; AVX512_32: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
; AVX512_32: fildll
+
; AVX512_64: vcvtusi2ssq
+
+; SSE2_32: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32: movq %xmm0, {{[0-9]+}}(%esp)
; SSE2_32: fildll
+
; SSE2_64: cvtsi2ssq
; X87: fildll
define float @u64_to_f(i64 %a) nounwind {
@@ -95,6 +102,24 @@ define float @s64_to_f(i64 %a) nounwind {
ret float %r
}
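+; The add below keeps the i64 in GPRs, so 32-bit targets must reassemble the
+; value in an XMM register (movd/punpckldq or vmovd/vpinsrd) before spilling
+; it for fildll.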
+; CHECK-LABEL: s64_to_f_2
+; SSE2_32: movd %ecx, %xmm0
+; SSE2_32: movd %eax, %xmm1
+; SSE2_32: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2_32: movq %xmm1, {{[0-9]+}}(%esp)
+; SSE2_32: fildll {{[0-9]+}}(%esp)
+
+; AVX512_32: vmovd %eax, %xmm0
+; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: fildll {{[0-9]+}}(%esp)
+
+define float @s64_to_f_2(i64 %a) nounwind {
+ %a1 = add i64 %a, 5
+ %r = sitofp i64 %a1 to float
+ ret float %r
+}
+
; CHECK-LABEL: u64_to_d
; AVX512_32: vpunpckldq
; AVX512_64: vcvtusi2sdq
@@ -117,6 +142,24 @@ define double @s64_to_d(i64 %a) nounwind {
ret double %r
}
+; CHECK-LABEL: s64_to_d_2
+; SSE2_32: movd %ecx, %xmm0
+; SSE2_32: movd %eax, %xmm1
+; SSE2_32: punpckldq %xmm0, %xmm1
+; SSE2_32: movq %xmm1, {{[0-9]+}}(%esp)
+; SSE2_32: fildll
+
+; AVX512_32: vmovd %eax, %xmm0
+; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: fildll
+
+define double @s64_to_d_2(i64 %a) nounwind {
+ %b = add i64 %a, 5
+ %f = sitofp i64 %b to double
+ ret double %f
+}
+
; CHECK-LABEL: u64_to_x
; CHECK: fildll
define x86_fp80 @u64_to_x(i64 %a) nounwind {
diff --git a/test/CodeGen/X86/shrinkwrap-hang.ll b/test/CodeGen/X86/shrinkwrap-hang.ll
new file mode 100644
index 0000000000000..e1e4eefb3efa6
--- /dev/null
+++ b/test/CodeGen/X86/shrinkwrap-hang.ll
@@ -0,0 +1,32 @@
+; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-pc-linux"
+
+@b = global i32 1, align 4
+@a = common global i32 0, align 4
+
+declare void @fn1() #0
+
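+; The CFG below (a loop between %lbl and %if.end that is entered from %entry
+; at both blocks) is presumably what used to send shrink-wrapping into an
+; infinite loop; the real test is that llc terminates and emits fn2.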
+; CHECK-LABEL: fn2:
+define void @fn2() #0 {
+entry:
+ %0 = load i32, i32* @b, align 4
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %if.end, label %lbl
+
+lbl: ; preds = %if.end, %entry
+ store i32 0, i32* @b, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %lbl
+ tail call void @fn1()
+ %1 = load i32, i32* @b, align 4
+ %tobool1 = icmp eq i32 %1, 0
+ br i1 %tobool1, label %if.end3, label %lbl
+
+if.end3: ; preds = %if.end
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 105115bc7d25c..9f689cfe85e56 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -307,7 +307,7 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2si64_int
- ;CHECK: cvtsd2siq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+ ;CHECK: cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
ret i64 %2
diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll
new file mode 100644
index 0000000000000..9d80e9217b497
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-vector.ll
@@ -0,0 +1,162 @@
+; RUN: llc -mcpu=core-avx -debug-only=stackmaps < %s | FileCheck %s
+; REQUIRES: asserts
+
+target triple = "x86_64-pc-linux-gnu"
+
+; Can we lower a single vector?
+define <2 x i8 addrspace(1)*> @test(<2 x i8 addrspace(1)*> %obj) gc "statepoint-example" {
+entry:
+; CHECK-LABEL: @test
+; CHECK: subq $24, %rsp
+; CHECK: movaps %xmm0, (%rsp)
+; CHECK: callq do_safepoint
+; CHECK: movaps (%rsp), %xmm0
+; CHECK: addq $24, %rsp
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> %obj)
+ %obj.relocated = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %safepoint_token, i32 7, i32 7) ; (%obj, %obj)
+ ret <2 x i8 addrspace(1)*> %obj.relocated
+}
+
+; Can we lower the base/derived pairs if both are vectors?
+define <2 x i8 addrspace(1)*> @test2(<2 x i8 addrspace(1)*> %obj, i64 %offset) gc "statepoint-example" {
+entry:
+; CHECK-LABEL: @test2
+; CHECK: subq $40, %rsp
+; CHECK: movd %rdi, %xmm1
+; CHECK: pshufd $68, %xmm1, %xmm1 # xmm1 = xmm1[0,1,0,1]
+; CHECK: paddq %xmm0, %xmm1
+; CHECK: movdqa %xmm0, 16(%rsp)
+; CHECK: movdqa %xmm1, (%rsp)
+; CHECK: callq do_safepoint
+; CHECK: movaps (%rsp), %xmm0
+; CHECK: addq $40, %rsp
+ %derived = getelementptr i8, <2 x i8 addrspace(1)*> %obj, i64 %offset
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> %obj, <2 x i8 addrspace(1)*> %derived)
+ %derived.relocated = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %safepoint_token, i32 7, i32 8) ; (%obj, %derived)
+ ret <2 x i8 addrspace(1)*> %derived.relocated
+}
+
+; Originally, this was just a variant of @test2 above, but it ends up
+; covering a bunch of interesting missed optimizations. Specifically:
+; - We waste a stack slot for a value that a backend transform pass
+; CSEd to another spilled one.
+; - We don't remove the testb even though it serves no purpose
+; - We could in principle reuse the argument memory (%rsi) and do away
+; with stack slots entirely.
+define <2 x i64 addrspace(1)*> @test3(i1 %cnd, <2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" {
+entry:
+; CHECK-LABEL: @test3
+; CHECK: subq $40, %rsp
+; CHECK: testb $1, %dil
+; CHECK: movaps (%rsi), %xmm0
+; CHECK: movaps %xmm0, 16(%rsp)
+; CHECK: movaps %xmm0, (%rsp)
+; CHECK: callq do_safepoint
+; CHECK: movaps (%rsp), %xmm0
+; CHECK: addq $40, %rsp
+ br i1 %cnd, label %taken, label %untaken
+
+taken: ; preds = %entry
+ %obja = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+ br label %merge
+
+untaken: ; preds = %entry
+ %objb = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+ br label %merge
+
+merge: ; preds = %untaken, %taken
+ %obj.base = phi <2 x i64 addrspace(1)*> [ %obja, %taken ], [ %objb, %untaken ]
+ %obj = phi <2 x i64 addrspace(1)*> [ %obja, %taken ], [ %objb, %untaken ]
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i64 addrspace(1)*> %obj, <2 x i64 addrspace(1)*> %obj.base)
+ %obj.relocated = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %safepoint_token, i32 8, i32 7) ; (%obj.base, %obj)
+ %obj.relocated.casted = bitcast <2 x i8 addrspace(1)*> %obj.relocated to <2 x i64 addrspace(1)*>
+ %obj.base.relocated = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %safepoint_token, i32 8, i32 8) ; (%obj.base, %obj.base)
+ %obj.base.relocated.casted = bitcast <2 x i8 addrspace(1)*> %obj.base.relocated to <2 x i64 addrspace(1)*>
+ ret <2 x i64 addrspace(1)*> %obj.relocated.casted
+}
+
+; Can we handle vector constants? At the moment, we don't appear to actually
+; get selection dag nodes for these.
+define <2 x i8 addrspace(1)*> @test4() gc "statepoint-example" {
+entry:
+; CHECK-LABEL: @test4
+; CHECK: subq $24, %rsp
+; CHECK: xorps %xmm0, %xmm0
+; CHECK: movaps %xmm0, (%rsp)
+; CHECK: callq do_safepoint
+; CHECK: movaps (%rsp), %xmm0
+; CHECK: addq $24, %rsp
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> zeroinitializer)
+ %obj.relocated = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %safepoint_token, i32 7, i32 7) ; (%obj, %obj)
+ ret <2 x i8 addrspace(1)*> %obj.relocated
+}
+
+; Check that we can lower a constant typed as i128 correctly. Note that the
+; actual value is representable in 64 bits; the StackMap format has no
+; representation for constants wider than 64 bits.
+define void @test5() gc "statepoint-example" {
+entry:
+; CHECK-LABEL: @test5
+; CHECK: push
+; CHECK: callq do_safepoint
+; CHECK: pop
+ %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 1, i128 0)
+ ret void
+}
+
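+; The location records below decode as: .byte 3 = Indirect, .byte 16 =
+; 16-byte slot, .short 7 = DWARF register 7 (%rsp), .long = offset from it.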
+; CHECK: __LLVM_StackMaps:
+
+; CHECK: .Ltmp1-test
+; Check for the two spill slots
+; Stack Maps: Loc 3: Indirect 7+0 [encoding: .byte 3, .byte 16, .short 7, .int 0]
+; Stack Maps: Loc 4: Indirect 7+0 [encoding: .byte 3, .byte 16, .short 7, .int 0]
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 0
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 0
+
+; CHECK: .Ltmp3-test2
+; Check for the two spill slots
+; Stack Maps: Loc 3: Indirect 7+16 [encoding: .byte 3, .byte 16, .short 7, .int 16]
+; Stack Maps: Loc 4: Indirect 7+0 [encoding: .byte 3, .byte 16, .short 7, .int 0]
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 16
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 0
+
+; CHECK: .Ltmp5-test3
+; Check for the four spill slots
+; Stack Maps: Loc 3: Indirect 7+16 [encoding: .byte 3, .byte 16, .short 7, .int 16]
+; Stack Maps: Loc 4: Indirect 7+16 [encoding: .byte 3, .byte 16, .short 7, .int 16]
+; Stack Maps: Loc 5: Indirect 7+16 [encoding: .byte 3, .byte 16, .short 7, .int 16]
+; Stack Maps: Loc 6: Indirect 7+0 [encoding: .byte 3, .byte 16, .short 7, .int 0]
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 16
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 16
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 16
+; CHECK: .byte 3
+; CHECK: .byte 16
+; CHECK: .short 7
+; CHECK: .long 0
+
+declare void @do_safepoint()
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
+declare <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token, i32, i32)
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index 1f36d064f873e..dfc186bef052a 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -6,6 +6,10 @@
; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST
; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx2 \
; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512vl \
+; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
; CST: [[MASKCSTADDR:.LCPI[0-9_]+]]:
; CST-NEXT: .long 65535 # 0xffff
@@ -58,6 +62,16 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; AVX512VL-NEXT: retq
%tmp = uitofp <4 x i32> %arg to <4 x float>
ret <4 x float> %tmp
}
@@ -125,6 +139,16 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0
+; AVX512VL-NEXT: retq
%tmp = uitofp <8 x i32> %arg to <8 x float>
ret <8 x float> %tmp
}
diff --git a/test/CodeGen/X86/version_directive.ll b/test/CodeGen/X86/version_directive.ll
new file mode 100644
index 0000000000000..8e4e6dc70e61b
--- /dev/null
+++ b/test/CodeGen/X86/version_directive.ll
@@ -0,0 +1,4 @@
+; RUN: llc -mtriple x86_64-apple-darwin15.0.0 -o - /dev/null | FileCheck %s
+; RUN: llc -mtriple x86_64-apple-macosx10.11.0 -o - /dev/null | FileCheck %s
+
+; CHECK: .macosx_version_min 10, 11
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 34e56919468be..609e2cc1158c9 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -878,4 +878,109 @@ for.end: ; preds = %for.cond.for.end_cr
declare i32 @varfunc(i8* nocapture readonly)
+@sum1 = external hidden thread_local global i32, align 4
+
+
+; Function Attrs: nounwind
+; Make sure the TLS call used to access @sum1 happens after the prologue
+; and before the epilogue.
+; TLS calls used to be wrongly modeled, and shrink-wrapping would have inserted
+; the prologue and epilogue just around the call to doSomething.
+; PR25820.
+;
+; CHECK-LABEL: tlsCall:
+; CHECK: pushq
+; CHECK: testb $1, %dil
+; CHECK: je [[ELSE_LABEL:LBB[0-9_]+]]
+;
+; master bb
+; CHECK: movq _sum1@TLVP(%rip), %rdi
+; CHECK-NEXT: callq *(%rdi)
+; CHECK: jmp [[EXIT_LABEL:LBB[0-9_]+]]
+;
+; [[ELSE_LABEL]]:
+; CHECK: callq _doSomething
+;
+; [[EXIT_LABEL]]:
+; CHECK: popq
+; CHECK-NEXT: retq
+define i32 @tlsCall(i1 %bool1, i32 %arg, i32* readonly dereferenceable(4) %sum1) #3 {
+entry:
+ br i1 %bool1, label %master, label %else
+
+master:
+ %tmp1 = load i32, i32* %sum1, align 4
+ store i32 %tmp1, i32* @sum1, align 4
+ br label %exit
+
+else:
+ %call = call i32 @doSomething(i32 0, i32* null)
+ br label %exit
+
+exit:
+ %res = phi i32 [ %arg, %master], [ %call, %else ]
+ ret i32 %res
+}
+
attributes #3 = { nounwind }
+
+@irreducibleCFGa = common global i32 0, align 4
+@irreducibleCFGf = common global i8 0, align 1
+@irreducibleCFGb = common global i32 0, align 4
+
+; Check that we do not run shrink-wrapping on irreducible CFGs until
+; it is actually supported.
+; At the moment, on those CFGs the loop information may be incorrect,
+; and since we use that information to do the placement, we may end up
+; inserting the prologue/epilogue at incorrect places.
+; PR25988.
+;
+; CHECK-LABEL: irreducibleCFG:
+; CHECK: %entry
+; Make sure the prologue happens in the entry block.
+; CHECK-NEXT: pushq
+; ...
+; Make sure the epilogue happens in the exit block.
+; CHECK-NOT: popq
+; CHECK: popq
+; CHECK-NEXT: popq
+; CHECK-NEXT: retq
+define i32 @irreducibleCFG() #4 {
+entry:
+ %i0 = load i32, i32* @irreducibleCFGa, align 4
+ %.pr = load i8, i8* @irreducibleCFGf, align 1
+ %bool = icmp eq i8 %.pr, 0
+ br i1 %bool, label %split, label %preheader
+
+preheader:
+ br label %preheader
+
+split:
+ %i1 = load i32, i32* @irreducibleCFGb, align 4
+ %tobool1.i = icmp ne i32 %i1, 0
+ br i1 %tobool1.i, label %for.body4.i, label %for.cond8.i.preheader
+
+for.body4.i:
+ %call.i = tail call i32 (...) @something(i32 %i0)
+ br label %for.cond8
+
+for.cond8:
+ %p1 = phi i32 [ %inc18.i, %for.inc ], [ 0, %for.body4.i ]
+ %.pr1.pr = load i32, i32* @irreducibleCFGb, align 4
+ br label %for.cond8.i.preheader
+
+for.cond8.i.preheader:
+ %.pr1 = phi i32 [ %.pr1.pr, %for.cond8 ], [ %i1, %split ]
+ %p13 = phi i32 [ %p1, %for.cond8 ], [ 0, %split ]
+ br label %for.inc
+
+fn1.exit:
+ ret i32 0
+
+for.inc:
+ %inc18.i = add nuw nsw i32 %p13, 1
+ %cmp = icmp slt i32 %inc18.i, 7
+ br i1 %cmp, label %for.cond8, label %fn1.exit
+}
+
+attributes #4 = { "no-frame-pointer-elim"="true" }