58 files changed, 1940 insertions, 567 deletions
diff --git a/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll b/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll
new file mode 100644
index 000000000000..a71b5e86138d
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-vcvtfp2fxs-combine.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=aarch64-linux-eabi -o - | FileCheck %s
+
+%struct.a= type { i64, i64, i64, i64 }
+
+; DAG combine will try to perform a transformation that  creates a vcvtfp2fxs
+; with a v4f64 input. Since v4i64 is not legal we should bail out. We can
+; pottentially still create the vcvtfp2fxs node after legalization (but on a
+; v2f64).
+
+; CHECK-LABEL: fun1
+define void @fun1() local_unnamed_addr {
+entry:
+  %mul = fmul <4 x double> zeroinitializer, <double 6.553600e+04, double 6.553600e+04, double 6.553600e+04, double 6.553600e+04>
+  %toi = fptosi <4 x double> %mul to <4 x i64>
+  %ptr = getelementptr inbounds %struct.a, %struct.a* undef, i64 0, i32 2
+  %elem = extractelement <4 x i64> %toi, i32 1
+  store i64 %elem, i64* %ptr, align 8
+  call void @llvm.trap()
+  unreachable
+}
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap()
+
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
index a12132f425d9..d78c75165be2 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@@ -1,8 +1,246 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
-; RUN: opt -S -amdgpu-codegenprepare < %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
+; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
 ; Make sure this doesn't crash with no triple
 
-; CHECK-LABEL: @foo(
-define void @foo() {
+; NOOP-LABEL: @noop_fdiv_fpmath(
+; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
+define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: @fdiv_fpmath(
+; CHECK: %no.md = fdiv float %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
+; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
+; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float %a, %b, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %md.1ulp = fdiv float %a, %b, !fpmath !2
+  store volatile float %md.1ulp, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.3ulp = fdiv float %a, %b, !fpmath !3
+  store volatile float %md.3ulp, float addrspace(1)* %out
+
+  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+
+  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath(
+; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
+; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
+; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
+; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
+; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
+; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
+; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
+define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+  %no.md = fdiv float 1.0, %x
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float 1.0, %x, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp float 1.0, %x
+  store volatile float %arcp.no.md, float addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
+  store volatile float %arcp.25ulp, float addrspace(1)* %out
+
+  %fast.no.md = fdiv fast float 1.0, %x
+  store volatile float %fast.no.md, float addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
+  store volatile float %fast.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_fpmath_vector(
+; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+
+; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
+; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
+; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
+; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
+define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+  %no.md = fdiv <2 x float> %a, %b
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+  store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
+
+  %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
+  store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp
+define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; FIXME: Should be able to get fdiv for 1.0 component
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp
+define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+  %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
+
+  %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
+; CHECK: %no.md = fdiv float %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
+; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
+; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float %a, %b, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %md.1ulp = fdiv float %a, %b, !fpmath !2
+  store volatile float %md.1ulp, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.3ulp = fdiv float %a, %b, !fpmath !3
+  store volatile float %md.3ulp, float addrspace(1)* %out
+
+  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+
+  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+attributes #0 = { nounwind optnone noinline }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "target-features"="+fp32-denormals" }
+
+; CHECK: !0 = !{float 2.500000e+00}
+; CHECK: !1 = !{float 5.000000e-01}
+; CHECK: !2 = !{float 1.000000e+00}
+; CHECK: !3 = !{float 3.000000e+00}
+
+!0 = !{float 2.500000e+00}
+!1 = !{float 5.000000e-01}
+!2 = !{float 1.000000e+00}
+!3 = !{float 3.000000e+00}
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 7b5158629091..bd0817d30413 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -417,12 +417,6 @@ entry:
   ret void
 }
 
-; HSAOPT: !0 = !{}
-; HSAOPT: !1 = !{i32 0, i32 2048}
-
-; NOHSAOPT: !0 = !{i32 0, i32 2048}
-
-
 ; FUNC-LABEL: v16i32_stack:
 
 ; R600: MOVA_INT
@@ -527,4 +521,33 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
   ret void
 }
 
+; OPT-LABEL: @direct_alloca_read_0xi32(
+; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
+; OPT: load [0 x i32], [0 x i32] addrspace(3)*
+define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
+entry:
+  %tmp = alloca [0 x i32]
+  store [0 x i32] [], [0 x i32]* %tmp
+  %load = load [0 x i32], [0 x i32]* %tmp
+  store [0 x i32] %load, [0 x i32] addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @direct_alloca_read_1xi32(
+; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
+; OPT: load [1 x i32], [1 x i32] addrspace(3)*
+define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
+entry:
+  %tmp = alloca [1 x i32]
+  store [1 x i32] [i32 0], [1 x i32]* %tmp
+  %load = load [1 x i32], [1 x i32]* %tmp
+  store [1 x i32] %load, [1 x i32] addrspace(1)* %out
+  ret void
+}
+
 attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
+
+; HSAOPT: !0 = !{}
+; HSAOPT: !1 = !{i32 0, i32 2048}
+
+; NOHSAOPT: !0 = !{i32 0, i32 2048}
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index ff730a085255..00636240bc6c 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -6,7 +6,6 @@
 ; GCN-LABEL: {{^}}test_branch:
 ; GCNNOOPT: v_writelane_b32
 ; GCNNOOPT: v_writelane_b32
-; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; BB#1
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 4021233e7785..65464cdba604 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -1,8 +1,4 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
@@ -15,22 +11,59 @@
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
+; SI: v_div_scale_f32
+; SI-DAG: v_div_scale_f32
 
 ; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_mul_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_div_fmas_f32
+; SI: v_div_fixup_f32
+define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+  %fdiv = fdiv float %a, %b
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
+; SI: v_cndmask_b32
+; SI: v_mul_f32
+; SI: v_rcp_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
+define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+  %fdiv = fdiv float %a, %b, !fpmath !0
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+; Use correct fdiv
+; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
+; SI: v_fma_f32
+; SI: v_div_fmas_f32
+; SI: v_div_fixup_f32
+define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+entry:
+  %fdiv = fdiv float %a, %b, !fpmath !0
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
 
-; I754-DAG: v_div_scale_f32
-; I754-DAG: v_rcp_f32
-; I754-DAG: v_fma_f32
-; I754-DAG: v_mul_f32
-; I754-DAG: v_fma_f32
-; I754-DAG: v_div_fixup_f32
-define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
+; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
+; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
-  %0 = fdiv float %a, %b
-  store float %0, float addrspace(1)* %out
+  %fdiv = fdiv fast float %a, %b
+  store float %fdiv, float addrspace(1)* %out
   ret void
 }
 
@@ -38,15 +71,14 @@ entry:
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
+; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
-  %0 = fdiv fast float %a, %b
-  store float %0, float addrspace(1)* %out
+  %fdiv = fdiv fast float %a, %b
+  store float %fdiv, float addrspace(1)* %out
   ret void
 }
 
@@ -54,15 +86,14 @@ entry:
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
+; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
-  %0 = fdiv arcp float %a, %b
-  store float %0, float addrspace(1)* %out
+  %fdiv = fdiv arcp float %a, %b
+  store float %fdiv, float addrspace(1)* %out
   ret void
 }
 
@@ -72,26 +103,24 @@ entry:
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+entry:
+  %fdiv = fdiv <2 x float> %a, %b
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
+  ret void
+}
 
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
+; SI: v_cmp_gt_f32
+; SI: v_cmp_gt_f32
+define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
-  %0 = fdiv <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
   ret void
 }
 
@@ -101,19 +130,12 @@ entry:
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
-  %0 = fdiv fast <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %fdiv = fdiv fast <2 x float> %a, %b
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
   ret void
 }
 
@@ -123,19 +145,12 @@ entry:
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
 
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_rcp_f32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
-  %0 = fdiv arcp <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %fdiv = fdiv arcp <2 x float> %a, %b
+  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
   ret void
 }
 
@@ -149,37 +164,11 @@ entry:
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_scale_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-; I754: v_div_fixup_f32
-define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; SI: v_div_fixup_f32
+; SI: v_div_fixup_f32
+; SI: v_div_fixup_f32
+; SI: v_div_fixup_f32
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -198,24 +187,11 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -234,24 +210,11 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
 
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_rcp_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-; UNSAFE-FP: v_mul_f32_e32
-
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-; SI-DAG: v_rcp_f32
-; SI-DAG: v_mul_f32
-define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+; SI: v_rcp_f32
+define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -259,3 +222,9 @@ define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> ad
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
+
+attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
+attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
+attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
+
+!0 = !{float 2.500000e+00}
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
index be23e10d7087..1537d67cadcc 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare double @llvm.fabs.f64(double) #1
 
 ; FUNC-LABEL: @fp_to_sint_f64_i32
 ; SI: v_cvt_i32_f64_e32
@@ -54,3 +55,23 @@ define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in
   store i64 %cast, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1:
+; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}}
+define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+  %conv = fptosi double %in to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1:
+; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}|
+define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+  %in.fabs = call double @llvm.fabs.f64(double %in)
+  %conv = fptosi double %in.fabs to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll
index b39aeadc8cce..0cd0358bafd5 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 
-declare float @llvm.fabs.f32(float) #0
+declare float @llvm.fabs.f32(float) #1
 
 ; FUNC-LABEL: {{^}}fp_to_sint_i32:
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
@@ -17,7 +17,7 @@ define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
 ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs:
 ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
 define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
-  %in.fabs = call float @llvm.fabs.f32(float %in) #0
+  %in.fabs = call float @llvm.fabs.f32(float %in)
   %conv = fptosi float %in.fabs to i32
   store i32 %conv, i32 addrspace(1)* %out
   ret void
@@ -227,4 +227,26 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
   ret void
 }
 
-attributes #0 = { nounwind readnone }
+; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i1:
+; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}}
+
+; EG: AND_INT
+; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y,
+; EG-NEXT: -1082130432(-1.000000e+00)
+define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+  %conv = fptosi float %in to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
+; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}|
+define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+  %in.fabs = call float @llvm.fabs.f32(float %in)
+  %conv = fptosi float %in.fabs to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
index 760019ebdc08..d5bc416434df 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare double @llvm.fabs.f64(double) #1
 
 ; SI-LABEL: {{^}}fp_to_uint_i32_f64:
 ; SI: v_cvt_u32_f64_e32
@@ -68,3 +69,23 @@ define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %
   store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32
   ret void
 }
+
+; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1:
+; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}}
+define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+  %conv = fptoui double %in to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1:
+; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}|
+define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+  %in.fabs = call double @llvm.fabs.f64(double %in)
+  %conv = fptoui double %in.fabs to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll
index b7b6ccc238b3..8a0f9fa2ac2b 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -1,6 +1,8 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC
+
+declare float @llvm.fabs.f32(float) #1
 
 ; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32:
 ; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
@@ -215,3 +217,27 @@ define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float>
   store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
   ret void
 }
+
+
+; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i1:
+; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}}
+
+; EG: AND_INT
+; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0,
+define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+  %conv = fptoui float %in to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
+; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}|
+define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+  %in.fabs = call float @llvm.fabs.f32(float %in)
+  %conv = fptoui float %in.fabs to i1
+  store i1 %conv, i1 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll
new file mode 100644
index 000000000000..4e17a921d91b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll
@@ -0,0 +1,8 @@
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s
+; check llc does not crash for invalid opencl version metadata
+
+; CHECK: .section        .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte	1
+; CHECK-NEXT: .short	256
+
+!opencl.ocl.version = !{}
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll
new file mode 100644
index 000000000000..35b7d70596c1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s
+; check llc does not crash for invalid opencl version metadata
+
+; CHECK: .section        .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte	1
+; CHECK-NEXT: .short	256
+
+!opencl.ocl.version = !{!0}
+!0 = !{}
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll
new file mode 100644
index 000000000000..e1693551b621
--- /dev/null
+++ b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s
+; check llc does not crash for invalid opencl version metadata
+
+; CHECK: .section        .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte	1
+; CHECK-NEXT: .short	256
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
new file mode 100644
index 000000000000..54d7848da3bf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+declare float @llvm.amdgcn.fdiv.fast(float, float) #0
+
+; CHECK-LABEL: {{^}}test_fdiv_fast:
+; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; CHECK: v_mul_f32_e32
+; CHECK: v_rcp_f32_e32
+; CHECK: v_mul_f32_e32
+; CHECK: v_mul_f32_e32
+define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
+  %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
index cf6d1ab237cd..6014e2ed85f8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -2,13 +2,14 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
 
-
 @lds0 = addrspace(3) global [512 x float] undef, align 4
 @lds1 = addrspace(3) global [256 x float] undef, align 4
 
-; FUNC-LABEL: {{^}}groupstaticsize_test0:
-; CHECK: s_movk_i32 s{{[0-9]+}}, 0x800
-define void @get_groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
+@large = addrspace(3) global [4096 x i32] undef, align 4
+
+; CHECK-LABEL: {{^}}groupstaticsize_test0:
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
+define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -20,9 +21,8 @@ define void @get_groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1
   ret void
 }
 
-
-; FUNC-LABEL: {{^}}groupstaticsize_test1:
-; CHECK: s_movk_i32 s{{[0-9]+}}, 0xc00
+; CHECK-LABEL: {{^}}groupstaticsize_test1:
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
 define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
 entry:
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -48,6 +48,16 @@ endif:                                            ; preds = %else, %if
   ret void
 }
 
+; Exceeds 16-bit simm limit of s_movk_i32
+; CHECK-LABEL: {{^}}large_groupstaticsize:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
+define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
+  %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
+  store volatile i32 0, i32 addrspace(3)* %gep
+  %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
+  store i32 %static_lds_size, i32 addrspace(1)* %size
+  ret void
+}
 
 declare i32 @llvm.amdgcn.groupstaticsize() #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll
index b1d422062543..27a88f7b59e7 100644
--- a/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -1,11 +1,96 @@
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; FIXME: Evergreen only ever does unsafe fp math.
 ; FUNC-LABEL: {{^}}rcp_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
 ; EG: RECIP_IEEE
-define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %rcp = fdiv float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %rcp = fdiv fast float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %rcp = fdiv arcp float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]]
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
+  %rcp = fdiv float 1.0, %src, !fpmath !0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_fabs_pat_f32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]|
+; GCN: buffer_store_dword [[RCP]]
+
+; EG: RECIP_IEEE
+define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %src.fabs = call float @llvm.fabs.f32(float %src)
+  %rcp = fdiv float 1.0, %src.fabs
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FIXME: fneg folded into constant 1
+; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32:
+define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
+  %src.fabs = call float @llvm.fabs.f32(float %src)
+  %src.fabs.fneg = fsub float -0.0, %src.fabs
+  %rcp = fdiv float 1.0, %src.fabs.fneg
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+declare float @llvm.fabs.f32(float) #1
+
+attributes #0 = { nounwind "unsafe-fp-math"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
+
+!0 = !{float 2.500000e+00}
diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll
deleted file mode 100644
index f9292a788521..000000000000
--- a/test/CodeGen/AMDGPU/reciprocal.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define amdgpu_ps void @test(<4 x float> inreg %reg0) {
-   %r0 = extractelement <4 x float> %reg0, i32 0
-   %r1 = fdiv float 1.0, %r0
-   %vec = insertelement <4 x float> undef, float %r1, i32 0
-   call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
-   ret void
-}
-
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
index 10187f6125d6..4ba4ac76a280 100644
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -348,7 +348,6 @@ bb7:                                              ; preds = %bb4
 ; CHECK: image_sample_c
 
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
-; CHECK: s_and_b64 exec, exec,
 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
 ; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
@@ -385,6 +384,7 @@ bb9:                                              ; preds = %bb4
 
 declare void @llvm.AMDGPU.kill(float) #0
 declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll
index c151ca9ef9b4..7dcf36f144ac 100644
--- a/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -3,6 +3,11 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
+
+; OPT-LABEL: @vector_read(
+; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
 
 ; FUNC-LABEL: {{^}}vector_read:
 ; EG: MOV
@@ -12,21 +17,26 @@
 ; EG: MOVA_INT
 define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
 entry:
-  %0 = alloca [4 x i32]
-  %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+  %tmp = alloca [4 x i32]
+  %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
   store i32 0, i32* %x
   store i32 1, i32* %y
   store i32 2, i32* %z
   store i32 3, i32* %w
-  %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
-  %2 = load i32, i32* %1
-  store i32 %2, i32 addrspace(1)* %out
+  %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
+  %tmp2 = load i32, i32* %tmp1
+  store i32 %tmp2, i32 addrspace(1)* %out
   ret void
 }
 
+; OPT-LABEL: @vector_write(
+; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
+; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
+; OPT: store i32 %1, i32 addrspace(1)* %out, align 4
+
 ; FUNC-LABEL: {{^}}vector_write:
 ; EG: MOV
 ; EG: MOV
@@ -36,42 +46,95 @@ entry:
 ; EG: MOVA_INT
 define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
-  %0 = alloca [4 x i32]
-  %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+  %tmp = alloca [4 x i32]
+  %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
   store i32 0, i32* %x
   store i32 0, i32* %y
   store i32 0, i32* %z
   store i32 0, i32* %w
-  %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
-  store i32 1, i32* %1
-  %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
-  %3 = load i32, i32* %2
-  store i32 %3, i32 addrspace(1)* %out
+  %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %w_index
+  store i32 1, i32* %tmp1
+  %tmp2 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %r_index
+  %tmp3 = load i32, i32* %tmp2
+  store i32 %tmp3, i32 addrspace(1)* %out
   ret void
 }
 
 ; This test should be optimize to:
 ; store i32 0, i32 addrspace(1)* %out
+
+; OPT-LABEL: @bitcast_gep(
+; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4
+
 ; FUNC-LABEL: {{^}}bitcast_gep:
 ; EG: STORE_RAW
 define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
-  %0 = alloca [4 x i32]
-  %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+  %tmp = alloca [4 x i32]
+  %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
   store i32 0, i32* %x
   store i32 0, i32* %y
   store i32 0, i32* %z
   store i32 0, i32* %w
-  %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
-  %2 = bitcast i32* %1 to [4 x i32]*
-  %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
-  %4 = load i32, i32* %3
-  store i32 %4, i32 addrspace(1)* %out
+  %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
+  %tmp2 = bitcast i32* %tmp1 to [4 x i32]*
+  %tmp3 = getelementptr [4 x i32], [4 x i32]* %tmp2, i32 0, i32 0
+  %tmp4 = load i32, i32* %tmp3
+  store i32 %tmp4, i32 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @vector_read_bitcast_gep(
+; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
+; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
+define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %tmp = alloca [4 x i32]
+  %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
+  %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
+  %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
+  %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
+  %bc = bitcast i32* %x to float*
+  store float 1.0, float* %bc
+  store i32 1, i32* %y
+  store i32 2, i32* %z
+  store i32 3, i32* %w
+  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
+  %tmp2 = load i32, i32* %tmp1
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be able to promote this. Instcombine should fold the
+; cast in the hasOneUse case so it might not matter in practice
+
+; OPT-LABEL: @vector_read_bitcast_alloca(
+; OPT: alloca [4 x float]
+; OPT: store float
+; OPT: store float
+; OPT: store float
+; OPT: store float
+; OPT: load float
+define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
+entry:
+  %tmp = alloca [4 x i32]
+  %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]*
+  %x = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 0
+  %y = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 1
+  %z = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 2
+  %w = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 3
+  store float 0.0, float* %x
+  store float 1.0, float* %y
+  store float 2.0, float* %z
+  store float 4.0, float* %w
+  %tmp1 = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 %index
+  %tmp2 = load float, float* %tmp1
+  store float %tmp2, float addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
index 23b0ffd5b3da..809a7ba9b826 100644
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -41,14 +41,14 @@ main_body:
 ;CHECK: store
 ;CHECK-NOT: exec
 ;CHECK: .size test3
-define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
   %tex.2 = extractelement <4 x i32> %tex.1, i32 0
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
-  %wr = extractelement <4 x float> %tex, i32 1
-  store float %wr, float addrspace(1)* %gep
+
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)
+
   ret <4 x float> %tex
 }
 
@@ -66,8 +66,9 @@ main_body:
 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
 main_body:
   %c.1 = mul i32 %c, %d
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
-  store float %data, float addrspace(1)* %gep
+
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
+
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   ret <4 x float> %tex
 }
@@ -89,7 +90,7 @@ main_body:
 ;CHECK: s_mov_b64 exec, [[SAVED]]
 ;CHECK: %IF
 ;CHECK: image_sample
-define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ELSE
@@ -100,8 +101,7 @@ IF:
   br label %END
 
 ELSE:
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
-  store float %data, float addrspace(1)* %gep
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
   br label %END
 
 END:
@@ -129,7 +129,7 @@ END:
 ;CHECK: s_or_b64 exec, exec,
 ;CHECK: v_mov_b32_e32 v0
 ;CHECK: ; return
-define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %ELSE, label %IF
@@ -140,8 +140,7 @@ IF:
   br label %END
 
 ELSE:
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
-  store float %data, float addrspace(1)* %gep
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
   br label %END
 
 END:
@@ -163,23 +162,20 @@ END:
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_cmp
-define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
 main_body:
   %idx.1 = extractelement <3 x i32> %idx, i32 0
-  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
   %data.1 = extractelement <2 x float> %data, i32 0
-  store float %data.1, float addrspace(1)* %gep.1
+  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
 
   ; The load that determines the branch (and should therefore be WQM) is
   ; surrounded by stores that require disabled WQM.
   %idx.2 = extractelement <3 x i32> %idx, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
-  %z = load float, float addrspace(1)* %gep.2
+  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
 
   %idx.3 = extractelement <3 x i32> %idx, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
   %data.3 = extractelement <2 x float> %data, i32 1
-  store float %data.3, float addrspace(1)* %gep.3
+  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
 
   %cc = fcmp ogt float %z, 0.0
   br i1 %cc, label %IF, label %ELSE
@@ -210,24 +206,21 @@ END:
 ;CHECK: load
 ;CHECK: store
 ;CHECK: v_cmp
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %tex.1 = extractelement <4 x float> %tex, i32 0
 
   %idx.1 = extractelement <3 x i32> %idx, i32 0
-  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
   %data.1 = extractelement <2 x float> %data, i32 0
-  store float %data.1, float addrspace(1)* %gep.1
+  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
 
   %idx.2 = extractelement <3 x i32> %idx, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
-  %z = load float, float addrspace(1)* %gep.2
+  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
 
   %idx.3 = extractelement <3 x i32> %idx, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
   %data.3 = extractelement <2 x float> %data, i32 1
-  store float %data.3, float addrspace(1)* %gep.3
+  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
 
   %cc = fcmp ogt float %z, 0.0
   br i1 %cc, label %IF, label %ELSE
@@ -258,15 +251,14 @@ END:
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: %END
 ;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
+define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
 main_body:
   %cond = icmp eq i32 %y, 0
   br i1 %cond, label %IF, label %END
 
 IF:
-  %data = load float, float addrspace(1)* %ptr
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
-  store float %data, float addrspace(1)* %gep
+  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
   br label %END
 
 END:
@@ -282,13 +274,11 @@ END:
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;SI: buffer_store_dword
-;VI: flat_store_dword
+;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_cmpx_
 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
-;SI: buffer_store_dword
-;VI: flat_store_dword
+;CHECK: buffer_store_dword
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: image_sample
 define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
@@ -296,16 +286,14 @@ main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
 
   %idx.0 = extractelement <2 x i32> %idx, i32 0
-  %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
   %data.0 = extractelement <2 x float> %data, i32 0
-  store float %data.0, float addrspace(1)* %gep.0
+  call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)
 
   call void @llvm.AMDGPU.kill(float %z)
 
   %idx.1 = extractelement <2 x i32> %idx, i32 1
-  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
   %data.1 = extractelement <2 x float> %data, i32 1
-  store float %data.1, float addrspace(1)* %gep.1
+  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
 
   %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %out = fadd <4 x float> %tex, %tex2
@@ -321,16 +309,14 @@ main_body:
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
-; SI: buffer_store_dword
-; VI: flat_store_dword
+; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
 ; CHECK: v_cmpx_
-define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
 
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
-  store float %data, float addrspace(1)* %gep
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
 
   call void @llvm.AMDGPU.kill(float %z)
 
@@ -350,9 +336,91 @@ main_body:
   ret float %s
 }
 
+; CHECK-LABEL: {{^}}test_loop_vcc:
+; CHECK-NEXT: ; %entry
+; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: image_store
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2
+; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[LOOPHDR]]: ; %loop
+; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
+; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]]
+; CHECK: s_cbranch_vccz
+; CHECK: ; %break
+
+; CHECK: ; return
+define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
+entry:
+  call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
+  br label %loop
+
+loop:
+  %ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ]
+  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
+  %cc = icmp sgt i32 %ctr.iv, 7
+  br i1 %cc, label %break, label %body
+
+body:
+  %c.i = bitcast <4 x float> %c.iv to <4 x i32>
+  %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %ctr.next = add i32 %ctr.iv, 2
+  br label %loop
+
+break:
+  ret <4 x float> %c.iv
+}
+
+; Only intrinsic stores need exact execution -- other stores do not have
+; externally visible effects and may require WQM for correctness.
+;
+; CHECK-LABEL: {{^}}test_alloca:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+
+; CHECK: image_sample
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dwordx4
+define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
+entry:
+  %array = alloca [32 x i32], align 4
+
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+
+  %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
+  store volatile i32 %a, i32* %s.gep, align 4
+
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
+
+  %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
+  %c = load i32, i32* %c.gep, align 4
+
+  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+
+  ret void
+}
+
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
 
 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
 
 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 151cc1b12ed2..04eae8f9afec 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -49,7 +49,7 @@ tailrecurse.switch:                               ; preds = %tailrecurse
 ; V8-NEXT: beq
 ; V8-NEXT: %tailrecurse.switch
 ; V8: cmp
-; V8-NEXT: beq
+; V8-NEXT: bne
 ; V8-NEXT: b	
 ; The trailing space in the last line checks that the branch is unconditional
   switch i32 %and, label %sw.epilog [
diff --git a/test/CodeGen/ARM/ssat-v4t.ll b/test/CodeGen/ARM/ssat-v4t.ll
new file mode 100644
index 000000000000..3d74c88da827
--- /dev/null
+++ b/test/CodeGen/ARM/ssat-v4t.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -O1 -mtriple=armv4t-none-none-eabi %s -o - 2>&1 | FileCheck %s
+
+; CHECK: Cannot select: intrinsic %llvm.arm.ssat
+define i32 @ssat() nounwind {
+  %tmp = call i32 @llvm.arm.ssat(i32 128, i32 1)
+  ret i32 %tmp
+}
+
+declare i32 @llvm.arm.ssat(i32, i32) nounwind readnone
diff --git a/test/CodeGen/ARM/ssat.ll b/test/CodeGen/ARM/ssat.ll
index 2b75bc410aa8..f1e11dd33d1f 100644
--- a/test/CodeGen/ARM/ssat.ll
+++ b/test/CodeGen/ARM/ssat.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv4t-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V4T
+; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2
 
 ; Check for several conditions that should result in SSAT.
 ; For example, the base test is equivalent to
@@ -16,7 +17,8 @@
 ; 32-bit base test
 define i32 @sat_base_32bit(i32 %x) #0 {
 ; CHECK-LABEL: sat_base_32bit:
-; CHECK: ssat r0, #24, r0
+; V6T2: ssat r0, #24, r0
+; V4T-NOT: ssat
 entry:
   %cmpLow = icmp slt i32 %x, -8388608
   %cmpUp = icmp sgt i32 %x, 8388607
@@ -29,7 +31,8 @@ entry:
 ; 16-bit base test
 define i16 @sat_base_16bit(i16 %x) #0 {
 ; CHECK-LABEL: sat_base_16bit:
-; CHECK: ssat r0, #12, r0
+; V6T2: ssat r0, #12, r0
+; V4T-NOT: ssat
 entry:
   %cmpLow = icmp slt i16 %x, -2048
   %cmpUp = icmp sgt i16 %x, 2047
@@ -42,7 +45,8 @@ entry:
 ; 8-bit base test
 define i8 @sat_base_8bit(i8 %x) #0 {
 ; CHECK-LABEL: sat_base_8bit:
-; CHECK: ssat r0, #6, r0
+; V6T2: ssat r0, #6, r0
+; V4T-NOT: ssat
 entry:
   %cmpLow = icmp slt i8 %x, -32
   %cmpUp = icmp sgt i8 %x, 31
@@ -60,7 +64,8 @@ entry:
 ; x < -k ? -k : (x < k ? x : k)
 define i32 @sat_lower_upper_1(i32 %x) #0 {
 ; CHECK-LABEL: sat_lower_upper_1:
-; CHECK: ssat r0, #24, r0
+; V6T2: ssat r0, #24, r0
+; V4T-NOT: ssat
 entry:
   %cmpLow = icmp slt i32 %x, -8388608
   %cmpUp = icmp slt i32 %x, 8388607
@@ -72,7 +77,8 @@ entry:
 ; x > -k ? (x > k ? k : x) : -k
 define i32 @sat_lower_upper_2(i32 %x) #0 {
 ; CHECK-LABEL: sat_lower_upper_2:
-; CHECK: ssat    r0, #24, r0
+; V6T2: ssat    r0, #24, r0
+; V4T-NOT: ssat
 entry:
   %cmpLow = icmp sgt i32 %x, -8388608
   %cmpUp = icmp sgt i32 %x, 8388607
@@ -84,7 +90,8 @@ entry:
 ; x < k ? (x < -k ? -k : x) : k
 define i32 @sat_upper_lower_1(i32 %x) #0 {
 ; CHECK-LABEL: sat_upper_lower_1:
-; CHECK: ssat    r0, #24, r0
+; V6T2: ssat    r0, #24, r0
+; V4T-NOT: ssat
 entry:
   %cmpUp = icmp slt i32 %x, 8388607
   %cmpLow = icmp slt i32 %x, -8388608
@@ -96,7 +103,8 @@ entry:
 ; x > k ? k : (x < -k ? -k : x)
 define i32 @sat_upper_lower_2(i32 %x) #0 {
 ; CHECK-LABEL: sat_upper_lower_2:
-; CHECK: ssat    r0, #24, r0
+; V6T2: ssat    r0, #24, r0
+; V4T-NOT: ssat
 entry:
   %cmpUp = icmp sgt i32 %x, 8388607
   %cmpLow = icmp slt i32 %x, -8388608
@@ -108,7 +116,8 @@ entry:
 ; k < x ? k : (x > -k ? x : -k)
 define i32 @sat_upper_lower_3(i32 %x) #0 {
 ; CHECK-LABEL: sat_upper_lower_3:
-; CHECK: ssat    r0, #24, r0
+; V6T2: ssat    r0, #24, r0
+; V4T-NOT: ssat
 entry:
   %cmpUp = icmp slt i32 8388607, %x
   %cmpLow = icmp sgt i32 %x, -8388608
@@ -125,7 +134,8 @@ entry:
 ; k <= x ? k : (x >= -k ? x : -k)
 define i32 @sat_le_ge(i32 %x) #0 {
 ; CHECK-LABEL: sat_le_ge:
-; CHECK: ssat    r0, #24, r0
+; V6T2: ssat    r0, #24, r0
+; V4T-NOT: ssat
 entry:
   %cmpUp = icmp sle i32 8388607, %x
   %cmpLow = icmp sge i32 %x, -8388608
diff --git a/test/CodeGen/ARM/usat-v4t.ll b/test/CodeGen/ARM/usat-v4t.ll
new file mode 100644
index 000000000000..572c760e3ae6
--- /dev/null
+++ b/test/CodeGen/ARM/usat-v4t.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -O1 -mtriple=armv4t-none-none-eabi %s -o - 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.arm.usat
+define i32 @usat1() nounwind {
+  %tmp = call i32 @llvm.arm.usat(i32 128, i32 31)
+  ret i32 %tmp
+}
+
+declare i32 @llvm.arm.usat(i32, i32) nounwind readnone
diff --git a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
index f736ddd0def6..c0229c626a0e 100644
--- a/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
+++ b/test/CodeGen/Mips/2009-11-16-CstPoolLoad.ll
@@ -11,13 +11,13 @@ entry:
 ; PIC-O32: lwc1 $f0, %lo($CPI0_0)($[[R0]])
 ; STATIC-O32: lui  $[[R0:[0-9]+]], %hi($CPI0_0)
 ; STATIC-O32: lwc1 $f0, %lo($CPI0_0)($[[R0]])
-; PIC-N32: lw  $[[R0:[0-9]+]], %got_page($CPI0_0)
-; PIC-N32: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]])
-; STATIC-N32: lui  $[[R0:[0-9]+]], %hi($CPI0_0)
-; STATIC-N32: lwc1 $f0, %lo($CPI0_0)($[[R0]])
-; PIC-N64: ld  $[[R0:[0-9]+]], %got_page($CPI0_0)
-; PIC-N64: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]])
-; STATIC-N64: ld  $[[R0:[0-9]+]], %got_page($CPI0_0)
-; STATIC-N64: lwc1 $f0, %got_ofst($CPI0_0)($[[R0]])
+; PIC-N32: lw  $[[R0:[0-9]+]], %got_page(.LCPI0_0)
+; PIC-N32: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]])
+; STATIC-N32: lui  $[[R0:[0-9]+]], %hi(.LCPI0_0)
+; STATIC-N32: lwc1 $f0, %lo(.LCPI0_0)($[[R0]])
+; PIC-N64: ld  $[[R0:[0-9]+]], %got_page(.LCPI0_0)
+; PIC-N64: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]])
+; STATIC-N64: ld  $[[R0:[0-9]+]], %got_page(.LCPI0_0)
+; STATIC-N64: lwc1 $f0, %got_ofst(.LCPI0_0)($[[R0]])
   ret float 0x400B333340000000
 }
diff --git a/test/CodeGen/Mips/2010-07-20-Switch.ll b/test/CodeGen/Mips/2010-07-20-Switch.ll
index 7d66d1a1a204..5f0a0a5a4929 100644
--- a/test/CodeGen/Mips/2010-07-20-Switch.ll
+++ b/test/CodeGen/Mips/2010-07-20-Switch.ll
@@ -27,9 +27,9 @@ entry:
 ; PIC-O32: addu $[[R5:[0-9]+]], $[[R4:[0-9]+]]
 ; PIC-O32: jr  $[[R5]]
 ; N64: dsll $[[R0:[0-9]+]], ${{[0-9]+}}, 3
-; N64: ld $[[R1:[0-9]+]], %got_page($JTI0_0)
+; N64: ld $[[R1:[0-9]+]], %got_page(.LJTI0_0)
 ; N64: daddu $[[R2:[0-9]+]], $[[R0:[0-9]+]], $[[R1]]
-; N64: ld $[[R4:[0-9]+]], %got_ofst($JTI0_0)($[[R2]])
+; N64: ld $[[R4:[0-9]+]], %got_ofst(.LJTI0_0)($[[R2]])
 ; N64: daddu $[[R5:[0-9]+]], $[[R4:[0-9]+]]
 ; N64: jr  $[[R5]]
   switch i32 %0, label %bb4 [
@@ -68,7 +68,7 @@ bb5:                                              ; preds = %entry
 ; PIC-O32: .gpword 
 ; PIC-O32: .gpword 
 ; N64: .p2align  3
-; N64: $JTI0_0:
+; N64: .LJTI0_0:
 ; N64: .gpdword
 ; N64: .gpdword
 ; N64: .gpdword 
diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll
index 377fe9327e0e..62150875e75b 100644
--- a/test/CodeGen/Mips/analyzebranch.ll
+++ b/test/CodeGen/Mips/analyzebranch.ll
@@ -10,7 +10,7 @@ define double @foo(double %a, double %b) nounwind readnone {
 entry:
 ; ALL-LABEL: foo:
 
-; FCC:           bc1f $BB
+; FCC:           bc1f {{\$|\.L}}BB
 ; FCC:           nop
 
 ; 32-GPR:        mtc1      $zero, $[[Z:f[0-9]]]
@@ -19,7 +19,7 @@ entry:
 ; GPR:           cmp.lt.d  $[[FGRCC:f[0-9]+]], $[[Z]], $f12
 ; GPR:           mfc1      $[[GPRCC:[0-9]+]], $[[FGRCC]]
 ; GPR-NOT:       not       $[[GPRCC]], $[[GPRCC]]
-; GPR:           bnezc     $[[GPRCC]], $BB
+; GPR:           bnezc     $[[GPRCC]], {{\$|\.L}}BB
 
   %cmp = fcmp ogt double %a, 0.000000e+00
   br i1 %cmp, label %if.end6, label %if.else
@@ -43,7 +43,7 @@ define void @f1(float %f) nounwind {
 entry:
 ; ALL-LABEL: f1:
 
-; FCC:           bc1f $BB
+; FCC:           bc1f {{\$|\.L}}BB
 ; FCC:           nop
 
 ; GPR:           mtc1     $zero, $[[Z:f[0-9]]]
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 8f4ccb19958a..dfba8ba19331 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -34,17 +34,17 @@ entry:
 ; MIPS32-ANY:    lw      $[[R0:[0-9]+]], %got(x)
 ; MIPS64-ANY:    ld      $[[R0:[0-9]+]], %got_disp(x)(
 
-; O0:        $[[BB0:[A-Z_0-9]+]]:
+; O0:        [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; O0:            ld      $[[R1:[0-9]+]]
 ; O0-NEXT:       ll      $[[R2:[0-9]+]], 0($[[R1]])
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R3:[0-9]+]], 0($[[R0]])
 ; ALL:           addu    $[[R4:[0-9]+]], $[[R3]], $4
 ; ALL:           sc      $[[R4]], 0($[[R0]])
-; NOT-MICROMIPS: beqz    $[[R4]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R4]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R4]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R4]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R4]], [[BB0]]
+; MIPSR6:        beqzc   $[[R4]], [[BB0]]
 }
 
 define i32 @AtomicLoadNand32(i32 signext %incr) nounwind {
@@ -59,14 +59,14 @@ entry:
 
 
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R1:[0-9]+]], 0($[[R0]])
 ; ALL:           and     $[[R3:[0-9]+]], $[[R1]], $4
 ; ALL:           nor     $[[R2:[0-9]+]], $zero, $[[R3]]
 ; ALL:           sc      $[[R2]], 0($[[R0]])
-; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], [[BB0]]
+; MIPSR6:        beqzc   $[[R2]], [[BB0]]
 }
 
 define i32 @AtomicSwap32(i32 signext %newval) nounwind {
@@ -82,12 +82,12 @@ entry:
 ; MIPS32-ANY:    lw      $[[R0:[0-9]+]], %got(x)
 ; MIPS64-ANY:    ld      $[[R0:[0-9]+]], %got_disp(x)
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      ${{[0-9]+}}, 0($[[R0]])
 ; ALL:           sc      $[[R2:[0-9]+]], 0($[[R0]])
-; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], [[BB0]]
+; MIPSR6:        beqzc   $[[R2]], [[BB0]]
 }
 
 define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind {
@@ -104,16 +104,16 @@ entry:
 ; MIPS32-ANY:    lw      $[[R0:[0-9]+]], %got(x)
 ; MIPS64-ANY:    ld      $[[R0:[0-9]+]], %got_disp(x)(
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $2, 0($[[R0]])
-; NOT-MICROMIPS: bne     $2, $4, $[[BB1:[A-Z_0-9]+]]
-; MICROMIPS:     bne     $2, $4, $[[BB1:[A-Z_0-9]+]]
-; MIPSR6:        bnec    $2, $4, $[[BB1:[A-Z_0-9]+]]
+; NOT-MICROMIPS: bne     $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]]
+; MICROMIPS:     bne     $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]]
+; MIPSR6:        bnec    $2, $4, [[BB1:(\$|\.L)[A-Z_0-9]+]]
 ; ALL:           sc      $[[R2:[0-9]+]], 0($[[R0]])
-; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R2]], $[[BB0]]
-; ALL:       $[[BB1]]:
+; NOT-MICROMIPS: beqz    $[[R2]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], [[BB0]]
+; MIPSR6:        beqzc   $[[R2]], [[BB0]]
+; ALL:       [[BB1]]:
 }
 
 
@@ -141,20 +141,20 @@ entry:
 ; ALL:           nor     $[[R8:[0-9]+]], $zero, $[[R7]]
 ; ALL:           sllv    $[[R9:[0-9]+]], $4, $[[R5]]
 
-; O0:        $[[BB0:[A-Z_0-9]+]]:
+; O0:        [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; O0:            ld      $[[R10:[0-9]+]]
 ; O0-NEXT:       ll      $[[R11:[0-9]+]], 0($[[R10]])
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R12:[0-9]+]], 0($[[R2]])
 ; ALL:           addu    $[[R13:[0-9]+]], $[[R12]], $[[R9]]
 ; ALL:           and     $[[R14:[0-9]+]], $[[R13]], $[[R7]]
 ; ALL:           and     $[[R15:[0-9]+]], $[[R12]], $[[R8]]
 ; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R14]]
 ; ALL:           sc      $[[R16]], 0($[[R2]])
-; NOT-MICROMIPS: beqz    $[[R16]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R16]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R16]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R16]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R16]], [[BB0]]
+; MIPSR6:        beqzc   $[[R16]], [[BB0]]
 
 ; ALL:           and     $[[R17:[0-9]+]], $[[R12]], $[[R7]]
 ; ALL:           srlv    $[[R18:[0-9]+]], $[[R17]], $[[R5]]
@@ -186,20 +186,20 @@ entry:
 ; ALL:        nor     $[[R8:[0-9]+]], $zero, $[[R7]]
 ; ALL:        sllv    $[[R9:[0-9]+]], $4, $[[R5]]
 
-; O0:        $[[BB0:[A-Z_0-9]+]]:
+; O0:        [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; O0:            ld      $[[R10:[0-9]+]]
 ; O0-NEXT:       ll      $[[R11:[0-9]+]], 0($[[R10]])
 
-; ALL:    $[[BB0:[A-Z_0-9]+]]:
+; ALL:    [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:        ll      $[[R12:[0-9]+]], 0($[[R2]])
 ; ALL:        subu    $[[R13:[0-9]+]], $[[R12]], $[[R9]]
 ; ALL:        and     $[[R14:[0-9]+]], $[[R13]], $[[R7]]
 ; ALL:        and     $[[R15:[0-9]+]], $[[R12]], $[[R8]]
 ; ALL:        or      $[[R16:[0-9]+]], $[[R15]], $[[R14]]
 ; ALL:        sc      $[[R16]], 0($[[R2]])
-; NOT-MICROMIPS: beqz    $[[R16]], $[[BB0]]
-; MICROMIPS:  beqzc   $[[R16]], $[[BB0]]
-; MIPSR6:     beqzc   $[[R16]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R16]], [[BB0]]
+; MICROMIPS:  beqzc   $[[R16]], [[BB0]]
+; MIPSR6:     beqzc   $[[R16]], [[BB0]]
 
 ; ALL:        and     $[[R17:[0-9]+]], $[[R12]], $[[R7]]
 ; ALL:        srlv    $[[R18:[0-9]+]], $[[R17]], $[[R5]]
@@ -231,11 +231,11 @@ entry:
 ; ALL:           nor     $[[R8:[0-9]+]], $zero, $[[R7]]
 ; ALL:           sllv    $[[R9:[0-9]+]], $4, $[[R5]]
 
-; O0:        $[[BB0:[A-Z_0-9]+]]:
+; O0:        [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; O0:            ld      $[[R10:[0-9]+]]
 ; O0-NEXT:       ll      $[[R11:[0-9]+]], 0($[[R10]])
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R12:[0-9]+]], 0($[[R2]])
 ; ALL:           and     $[[R13:[0-9]+]], $[[R12]], $[[R9]]
 ; ALL:           nor     $[[R14:[0-9]+]], $zero, $[[R13]]
@@ -243,9 +243,9 @@ entry:
 ; ALL:           and     $[[R16:[0-9]+]], $[[R12]], $[[R8]]
 ; ALL:           or      $[[R17:[0-9]+]], $[[R16]], $[[R15]]
 ; ALL:           sc      $[[R17]], 0($[[R2]])
-; NOT-MICROMIPS: beqz    $[[R17]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R17]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R17]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R17]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R17]], [[BB0]]
+; MIPSR6:        beqzc   $[[R17]], [[BB0]]
 
 ; ALL:           and     $[[R18:[0-9]+]], $[[R12]], $[[R7]]
 ; ALL:           srlv    $[[R19:[0-9]+]], $[[R18]], $[[R5]]
@@ -277,15 +277,15 @@ entry:
 ; ALL:           nor     $[[R8:[0-9]+]], $zero, $[[R7]]
 ; ALL:           sllv    $[[R9:[0-9]+]], $4, $[[R5]]
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R10:[0-9]+]], 0($[[R2]])
 ; ALL:           and     $[[R18:[0-9]+]], $[[R9]], $[[R7]]
 ; ALL:           and     $[[R13:[0-9]+]], $[[R10]], $[[R8]]
 ; ALL:           or      $[[R14:[0-9]+]], $[[R13]], $[[R18]]
 ; ALL:           sc      $[[R14]], 0($[[R2]])
-; NOT-MICROMIPS: beqz    $[[R14]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R14]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R14]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R14]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R14]], [[BB0]]
+; MIPSR6:        beqzc   $[[R14]], [[BB0]]
 
 ; ALL:           and     $[[R15:[0-9]+]], $[[R10]], $[[R7]]
 ; ALL:           srlv    $[[R16:[0-9]+]], $[[R15]], $[[R5]]
@@ -322,21 +322,21 @@ entry:
 ; ALL:           andi    $[[R11:[0-9]+]], $5, 255
 ; ALL:           sllv    $[[R12:[0-9]+]], $[[R11]], $[[R5]]
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R13:[0-9]+]], 0($[[R2]])
 ; ALL:           and     $[[R14:[0-9]+]], $[[R13]], $[[R7]]
-; NOT-MICROMIPS: bne     $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
-; MICROMIPS:     bne     $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
-; MIPSR6:        bnec    $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; NOT-MICROMIPS: bne     $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]]
+; MICROMIPS:     bne     $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]]
+; MIPSR6:        bnec    $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]]
 
 ; ALL:           and     $[[R15:[0-9]+]], $[[R13]], $[[R8]]
 ; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R12]]
 ; ALL:           sc      $[[R16]], 0($[[R2]])
-; NOT-MICROMIPS: beqz    $[[R16]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R16]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R16]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R16]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R16]], [[BB0]]
+; MIPSR6:        beqzc   $[[R16]], [[BB0]]
 
-; ALL:       $[[BB1]]:
+; ALL:       [[BB1]]:
 ; ALL:           srlv    $[[R17:[0-9]+]], $[[R14]], $[[R5]]
 
 ; NO-SEB-SEH:    sll     $[[R18:[0-9]+]], $[[R17]], 24
@@ -366,21 +366,21 @@ entry:
 ; ALL:           andi    $[[R11:[0-9]+]], $6, 255
 ; ALL:           sllv    $[[R12:[0-9]+]], $[[R11]], $[[R5]]
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R13:[0-9]+]], 0($[[R2]])
 ; ALL:           and     $[[R14:[0-9]+]], $[[R13]], $[[R7]]
-; NOT-MICROMIPS: bne     $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
-; MICROMIPS:     bne     $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
-; MIPSR6:        bnec    $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; NOT-MICROMIPS: bne     $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]]
+; MICROMIPS:     bne     $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]]
+; MIPSR6:        bnec    $[[R14]], $[[R10]], [[BB1:(\$|\.L)[A-Z_0-9]+]]
 
 ; ALL:           and     $[[R15:[0-9]+]], $[[R13]], $[[R8]]
 ; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R12]]
 ; ALL:           sc      $[[R16]], 0($[[R2]])
-; NOT-MICROMIPS: beqz    $[[R16]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R16]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R16]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R16]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R16]], [[BB0]]
+; MIPSR6:        beqzc   $[[R16]], [[BB0]]
 
-; ALL:       $[[BB1]]:
+; ALL:       [[BB1]]:
 ; ALL:           srlv    $[[R17:[0-9]+]], $[[R14]], $[[R5]]
 
 ; NO-SEB-SEH:    sll     $[[R18:[0-9]+]], $[[R17]], 24
@@ -423,20 +423,20 @@ entry:
 ; ALL:           nor     $[[R8:[0-9]+]], $zero, $[[R7]]
 ; ALL:           sllv    $[[R9:[0-9]+]], $4, $[[R5]]
 
-; O0:        $[[BB0:[A-Z_0-9]+]]:
+; O0:        [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; O0:            ld      $[[R10:[0-9]+]]
 ; O0-NEXT:       ll      $[[R11:[0-9]+]], 0($[[R10]])
 
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R12:[0-9]+]], 0($[[R2]])
 ; ALL:           addu    $[[R13:[0-9]+]], $[[R12]], $[[R9]]
 ; ALL:           and     $[[R14:[0-9]+]], $[[R13]], $[[R7]]
 ; ALL:           and     $[[R15:[0-9]+]], $[[R12]], $[[R8]]
 ; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R14]]
 ; ALL:           sc      $[[R16]], 0($[[R2]])
-; NOT-MICROMIPS: beqz    $[[R16]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R16]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R16]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R16]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R16]], [[BB0]]
+; MIPSR6:        beqzc   $[[R16]], [[BB0]]
 
 ; ALL:           and     $[[R17:[0-9]+]], $[[R12]], $[[R7]]
 ; ALL:           srlv    $[[R18:[0-9]+]], $[[R17]], $[[R5]]
@@ -465,15 +465,15 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) {
 ; ALL:           sync
 
 ; ALL:           andi    $[[R3:[0-9]+]], $[[R2]], 65535
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R4:[0-9]+]], 0($[[R5:[0-9]+]])
 ; ALL:           and     $[[R6:[0-9]+]], $[[R4]], $
 ; ALL:           and     $[[R7:[0-9]+]], $[[R4]], $
 ; ALL:           or      $[[R8:[0-9]+]], $[[R7]], $
 ; ALL:           sc      $[[R8]], 0($[[R5]])
-; NOT-MICROMIPS: beqz    $[[R8]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R8]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R8]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R8]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R8]], [[BB0]]
+; MIPSR6:        beqzc   $[[R8]], [[BB0]]
 
 ; ALL:           srlv    $[[R9:[0-9]+]], $[[R6]], $
 
@@ -538,11 +538,11 @@ entry:
 ; MIPS64-ANY:    ld      $[[R0:[0-9]+]], %got_disp(x)(
 
 ; ALL:           addiu   $[[PTR:[0-9]+]], $[[R0]], 1024
-; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:       [[BB0:(\$|\.L)[A-Z_0-9]+]]:
 ; ALL:           ll      $[[R1:[0-9]+]], 0($[[PTR]])
 ; ALL:           addu    $[[R2:[0-9]+]], $[[R1]], $4
 ; ALL:           sc      $[[R2]], 0($[[PTR]])
-; NOT-MICROMIPS: beqz    $[[R2]], $[[BB0]]
-; MICROMIPS:     beqzc   $[[R2]], $[[BB0]]
-; MIPSR6:        beqzc   $[[R2]], $[[BB0]]
+; NOT-MICROMIPS: beqz    $[[R2]], [[BB0]]
+; MICROMIPS:     beqzc   $[[R2]], [[BB0]]
+; MIPSR6:        beqzc   $[[R2]], [[BB0]]
 }
diff --git a/test/CodeGen/Mips/blez_bgez.ll b/test/CodeGen/Mips/blez_bgez.ll
index dcda047f8d09..84c8af45db81 100644
--- a/test/CodeGen/Mips/blez_bgez.ll
+++ b/test/CodeGen/Mips/blez_bgez.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=mips64el < %s | FileCheck %s
 
 ; CHECK-LABEL: test_blez:
-; CHECK: blez ${{[0-9]+}}, $BB
+; CHECK: blez ${{[0-9]+}}, {{\$|\.L}}BB
 
 define void @test_blez(i32 %a) {
 entry:
@@ -20,7 +20,7 @@ if.end:
 declare void @foo1()
 
 ; CHECK-LABEL: test_bgez:
-; CHECK: bgez ${{[0-9]+}}, $BB
+; CHECK: bgez ${{[0-9]+}}, {{\$|\.L}}BB
 
 define void @test_bgez(i32 %a) {
 entry:
diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll
index f74363702af5..9bc9a305a204 100644
--- a/test/CodeGen/Mips/blockaddr.ll
+++ b/test/CodeGen/Mips/blockaddr.ll
@@ -22,22 +22,22 @@ entry:
 ; STATIC-O32: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T2]])
 ; STATIC-O32: lui   $[[R3:[0-9]+]], %hi($tmp[[T3:[0-9]+]])
 ; STATIC-O32: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T3]])
-; PIC-N32: lw  $[[R0:[0-9]+]], %got_page($tmp[[T0:[0-9]+]])
-; PIC-N32: addiu ${{[0-9]+}}, $[[R0]], %got_ofst($tmp[[T0]])
-; PIC-N32: lw  $[[R1:[0-9]+]], %got_page($tmp[[T1:[0-9]+]])
-; PIC-N32: addiu ${{[0-9]+}}, $[[R1]], %got_ofst($tmp[[T1]])
-; STATIC-N32: lui  $[[R2:[0-9]+]], %hi($tmp[[T2:[0-9]+]])
-; STATIC-N32: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T2]])
-; STATIC-N32: lui   $[[R3:[0-9]+]], %hi($tmp[[T3:[0-9]+]])
-; STATIC-N32: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T3]])
-; PIC-N64: ld  $[[R0:[0-9]+]], %got_page($tmp[[T0:[0-9]+]])
-; PIC-N64: daddiu ${{[0-9]+}}, $[[R0]], %got_ofst($tmp[[T0]])
-; PIC-N64: ld  $[[R1:[0-9]+]], %got_page($tmp[[T1:[0-9]+]])
-; PIC-N64: daddiu ${{[0-9]+}}, $[[R1]], %got_ofst($tmp[[T1]])
-; STATIC-N64: ld  $[[R2:[0-9]+]], %got_page($tmp[[T2:[0-9]+]])
-; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst($tmp[[T2]])
-; STATIC-N64: ld  $[[R3:[0-9]+]], %got_page($tmp[[T3:[0-9]+]])
-; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst($tmp[[T3]])
+; PIC-N32: lw  $[[R0:[0-9]+]], %got_page(.Ltmp[[T0:[0-9]+]])
+; PIC-N32: addiu ${{[0-9]+}}, $[[R0]], %got_ofst(.Ltmp[[T0]])
+; PIC-N32: lw  $[[R1:[0-9]+]], %got_page(.Ltmp[[T1:[0-9]+]])
+; PIC-N32: addiu ${{[0-9]+}}, $[[R1]], %got_ofst(.Ltmp[[T1]])
+; STATIC-N32: lui  $[[R2:[0-9]+]], %hi(.Ltmp[[T2:[0-9]+]])
+; STATIC-N32: addiu ${{[0-9]+}}, $[[R2]], %lo(.Ltmp[[T2]])
+; STATIC-N32: lui   $[[R3:[0-9]+]], %hi(.Ltmp[[T3:[0-9]+]])
+; STATIC-N32: addiu ${{[0-9]+}}, $[[R3]], %lo(.Ltmp[[T3]])
+; PIC-N64: ld  $[[R0:[0-9]+]], %got_page(.Ltmp[[T0:[0-9]+]])
+; PIC-N64: daddiu ${{[0-9]+}}, $[[R0]], %got_ofst(.Ltmp[[T0]])
+; PIC-N64: ld  $[[R1:[0-9]+]], %got_page(.Ltmp[[T1:[0-9]+]])
+; PIC-N64: daddiu ${{[0-9]+}}, $[[R1]], %got_ofst(.Ltmp[[T1]])
+; STATIC-N64: ld  $[[R2:[0-9]+]], %got_page(.Ltmp[[T2:[0-9]+]])
+; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst(.Ltmp[[T2]])
+; STATIC-N64: ld  $[[R3:[0-9]+]], %got_page(.Ltmp[[T3:[0-9]+]])
+; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst(.Ltmp[[T3]])
 ; STATIC-MIPS16-1: .ent	f
 ; STATIC-MIPS16-2: .ent	f
 ; STATIC-MIPS16-1: li  $[[R1_16:[0-9]+]], %hi($tmp[[TI_16:[0-9]+]])
diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll
index d6d47678590a..9352294991aa 100644
--- a/test/CodeGen/Mips/ehframe-indirect.ll
+++ b/test/CodeGen/Mips/ehframe-indirect.ll
@@ -33,9 +33,15 @@ declare void @foo()
 
 ; ALL: GCC_except_table{{[0-9]+}}:
 ; ALL: .byte 155 # @TType Encoding = indirect pcrel sdata4
-; ALL: $[[PC_LABEL:tmp[0-9]+]]:
-; ALL: .4byte	($_ZTISt9exception.DW.stub)-($[[PC_LABEL]])
-; ALL: $_ZTISt9exception.DW.stub:
+; O32: [[PC_LABEL:\$tmp[0-9]+]]:
+; N32: [[PC_LABEL:\.Ltmp[0-9]+]]:
+; N64: [[PC_LABEL:\.Ltmp[0-9]+]]:
+; O32: .4byte	($_ZTISt9exception.DW.stub)-([[PC_LABEL]])
+; N32: .4byte	.L_ZTISt9exception.DW.stub-[[PC_LABEL]]
+; N64: .4byte	.L_ZTISt9exception.DW.stub-[[PC_LABEL]]
+; O32: $_ZTISt9exception.DW.stub:
+; N32: .L_ZTISt9exception.DW.stub:
+; N64: .L_ZTISt9exception.DW.stub:
 ; O32: .4byte _ZTISt9exception
 ; N32: .4byte _ZTISt9exception
 ; N64: .8byte _ZTISt9exception
diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll
index 142ee1144bbe..bd04ed0211f5 100644
--- a/test/CodeGen/Mips/fcmp.ll
+++ b/test/CodeGen/Mips/fcmp.ll
@@ -1076,12 +1076,12 @@ entry:
 ; 32-CMP-DAG:    bnezc    $[[T4]],
 
 ; 64-C-DAG:      add.s    $[[T0:f[0-9]+]], $f13, $f12
-; 64-C-DAG:      lwc1     $[[T1:f[0-9]+]], %got_ofst($CPI32_0)(
+; 64-C-DAG:      lwc1     $[[T1:f[0-9]+]], %got_ofst(.LCPI32_0)(
 ; 64-C-DAG:      c.ole.s  $[[T0]], $[[T1]]
 ; 64-C-DAG:      bc1t
 
 ; 64-CMP-DAG:    add.s    $[[T0:f[0-9]+]], $f13, $f12
-; 64-CMP-DAG:    lwc1     $[[T1:f[0-9]+]], %got_ofst($CPI32_0)(
+; 64-CMP-DAG:    lwc1     $[[T1:f[0-9]+]], %got_ofst(.LCPI32_0)(
 ; 64-CMP-DAG:    cmp.le.s $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
 ; 64-CMP-DAG:    mfc1     $[[T3:[0-9]+]], $[[T2]]
 ; FIXME: This instruction is redundant.
@@ -1106,8 +1106,8 @@ entry:
 ; MM64R6-DAG:    daddu    $[[T1:[0-9]+]], $[[T0]], $25
 ; MM64R6-DAG:    daddiu   $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f32)))
 ; MM64R6-DAG:    add.s    $[[T3:f[0-9]+]], $f13, $f12
-; MM64R6-DAG:    ld       $[[T4:[0-9]+]], %got_page($CPI32_0)($[[T2]])
-; MM64R6-DAG:    lwc1     $[[T5:f[0-9]+]], %got_ofst($CPI32_0)($[[T4]])
+; MM64R6-DAG:    ld       $[[T4:[0-9]+]], %got_page(.LCPI32_0)($[[T2]])
+; MM64R6-DAG:    lwc1     $[[T5:f[0-9]+]], %got_ofst(.LCPI32_0)($[[T4]])
 ; MM64R6-DAG:    cmp.le.s $[[T6:f[0-9]+]], $[[T3]], $[[T5]]
 ; MM64R6-DAG:    mfc1     $[[T7:[0-9]+]], $[[T6]]
 ; MM64R6-DAG:    andi16   $[[T8:[0-9]+]], $[[T7]], 1
@@ -1145,12 +1145,12 @@ entry:
 ; 32-CMP-DAG:    bnezc    $[[T4]],
 
 ; 64-C-DAG:      add.d    $[[T0:f[0-9]+]], $f13, $f12
-; 64-C-DAG:      ldc1     $[[T1:f[0-9]+]], %got_ofst($CPI33_0)(
+; 64-C-DAG:      ldc1     $[[T1:f[0-9]+]], %got_ofst(.LCPI33_0)(
 ; 64-C-DAG:      c.ole.d  $[[T0]], $[[T1]]
 ; 64-C-DAG:      bc1t
 
 ; 64-CMP-DAG:    add.d    $[[T0:f[0-9]+]], $f13, $f12
-; 64-CMP-DAG:    ldc1     $[[T1:f[0-9]+]], %got_ofst($CPI33_0)(
+; 64-CMP-DAG:    ldc1     $[[T1:f[0-9]+]], %got_ofst(.LCPI33_0)(
 ; 64-CMP-DAG:    cmp.le.d $[[T2:f[0-9]+]], $[[T0]], $[[T1]]
 ; 64-CMP-DAG:    mfc1     $[[T3:[0-9]+]], $[[T2]]
 ; FIXME: This instruction is redundant.
@@ -1175,8 +1175,8 @@ entry:
 ; MM64R6-DAG:    daddu    $[[T1:[0-9]+]], $[[T0]], $25
 ; MM64R6-DAG:    daddiu   $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f64)))
 ; MM64R6-DAG:    add.d    $[[T3:f[0-9]+]], $f13, $f12
-; MM64R6-DAG:    ld       $[[T4:[0-9]+]], %got_page($CPI33_0)($[[T2]])
-; MM64R6-DAG:    ldc1     $[[T5:f[0-9]+]], %got_ofst($CPI33_0)($[[T4]])
+; MM64R6-DAG:    ld       $[[T4:[0-9]+]], %got_page(.LCPI33_0)($[[T2]])
+; MM64R6-DAG:    ldc1     $[[T5:f[0-9]+]], %got_ofst(.LCPI33_0)($[[T4]])
 ; MM64R6-DAG:    cmp.le.d $[[T6:f[0-9]+]], $[[T3]], $[[T5]]
 ; MM64R6-DAG:    mfc1     $[[T7:[0-9]+]], $[[T6]]
 ; MM64R6-DAG:    andi16   $[[T8:[0-9]+]], $[[T7]], 1
diff --git a/test/CodeGen/Mips/fpbr.ll b/test/CodeGen/Mips/fpbr.ll
index bf1b045dbf28..7fb508f606b2 100644
--- a/test/CodeGen/Mips/fpbr.ll
+++ b/test/CodeGen/Mips/fpbr.ll
@@ -10,8 +10,9 @@ entry:
 ; ALL-LABEL: func0:
 
 ; 32-FCC:        c.eq.s $f12, $f14
+; 32-FCC:        bc1f   $BB0_2
 ; 64-FCC:        c.eq.s $f12, $f13
-; FCC:           bc1f   $BB0_2
+; 64-FCC:        bc1f   .LBB0_2
 
 ; 32-GPR:        cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f14
 ; 64-GPR:        cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f13
@@ -19,7 +20,7 @@ entry:
 ; FIXME: We ought to be able to transform not+bnez -> beqz
 ; GPR:           not      $[[GPRCC]], $[[GPRCC]]
 ; 32-GPR:        bnez     $[[GPRCC]], $BB0_2
-; 64-GPR:        bnezc    $[[GPRCC]], $BB0_2
+; 64-GPR:        bnezc    $[[GPRCC]], .LBB0_2
 
   %cmp = fcmp oeq float %f2, %f3
   br i1 %cmp, label %if.then, label %if.else
@@ -45,15 +46,16 @@ entry:
 ; ALL-LABEL: func1:
 
 ; 32-FCC:        c.olt.s $f12, $f14
+; 32-FCC:        bc1f    $BB1_2
 ; 64-FCC:        c.olt.s $f12, $f13
-; FCC:           bc1f    $BB1_2
+; 64-FCC:        bc1f    .LBB1_2
 
 ; 32-GPR:        cmp.ule.s $[[FGRCC:f[0-9]+]], $f14, $f12
 ; 64-GPR:        cmp.ule.s $[[FGRCC:f[0-9]+]], $f13, $f12
 ; GPR:           mfc1     $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
 ; GPR-NOT:       not      $[[GPRCC]], $[[GPRCC]]
 ; 32-GPR:        bnez     $[[GPRCC]], $BB1_2
-; 64-GPR:        bnezc    $[[GPRCC]], $BB1_2
+; 64-GPR:        bnezc    $[[GPRCC]], .LBB1_2
 
   %cmp = fcmp olt float %f2, %f3
   br i1 %cmp, label %if.then, label %if.else
@@ -75,15 +77,16 @@ entry:
 ; ALL-LABEL: func2:
 
 ; 32-FCC:        c.ole.s $f12, $f14
+; 32-FCC:        bc1t    $BB2_2
 ; 64-FCC:        c.ole.s $f12, $f13
-; FCC:           bc1t    $BB2_2
+; 64-FCC:        bc1t    .LBB2_2
 
 ; 32-GPR:        cmp.ult.s $[[FGRCC:f[0-9]+]], $f14, $f12
 ; 64-GPR:        cmp.ult.s $[[FGRCC:f[0-9]+]], $f13, $f12
 ; GPR:           mfc1     $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
 ; GPR-NOT:       not      $[[GPRCC]], $[[GPRCC]]
 ; 32-GPR:        beqz     $[[GPRCC]], $BB2_2
-; 64-GPR:        beqzc    $[[GPRCC]], $BB2_2
+; 64-GPR:        beqzc    $[[GPRCC]], .LBB2_2
 
   %cmp = fcmp ugt float %f2, %f3
   br i1 %cmp, label %if.else, label %if.then
@@ -105,8 +108,9 @@ entry:
 ; ALL-LABEL: func3:
 
 ; 32-FCC:        c.eq.d $f12, $f14
+; 32-FCC:        bc1f $BB3_2
 ; 64-FCC:        c.eq.d $f12, $f13
-; FCC:           bc1f $BB3_2
+; 64-FCC:        bc1f .LBB3_2
 
 ; 32-GPR:        cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f14
 ; 64-GPR:        cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f13
@@ -114,7 +118,7 @@ entry:
 ; FIXME: We ought to be able to transform not+bnez -> beqz
 ; GPR:           not      $[[GPRCC]], $[[GPRCC]]
 ; 32-GPR:        bnez     $[[GPRCC]], $BB3_2
-; 64-GPR:        bnezc    $[[GPRCC]], $BB3_2
+; 64-GPR:        bnezc    $[[GPRCC]], .LBB3_2
 
   %cmp = fcmp oeq double %f2, %f3
   br i1 %cmp, label %if.then, label %if.else
@@ -136,15 +140,16 @@ entry:
 ; ALL-LABEL: func4:
 
 ; 32-FCC:        c.olt.d $f12, $f14
+; 32-FCC:        bc1f $BB4_2
 ; 64-FCC:        c.olt.d $f12, $f13
-; FCC:           bc1f $BB4_2
+; 64-FCC:        bc1f .LBB4_2
 
 ; 32-GPR:        cmp.ule.d $[[FGRCC:f[0-9]+]], $f14, $f12
 ; 64-GPR:        cmp.ule.d $[[FGRCC:f[0-9]+]], $f13, $f12
 ; GPR:           mfc1     $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
 ; GPR-NOT:       not      $[[GPRCC]], $[[GPRCC]]
 ; 32-GPR:        bnez     $[[GPRCC]], $BB4_2
-; 64-GPR:        bnezc    $[[GPRCC]], $BB4_2
+; 64-GPR:        bnezc    $[[GPRCC]], .LBB4_2
 
   %cmp = fcmp olt double %f2, %f3
   br i1 %cmp, label %if.then, label %if.else
@@ -166,15 +171,16 @@ entry:
 ; ALL-LABEL: func5:
 
 ; 32-FCC:        c.ole.d $f12, $f14
+; 32-FCC:        bc1t $BB5_2
 ; 64-FCC:        c.ole.d $f12, $f13
-; FCC:           bc1t $BB5_2
+; 64-FCC:        bc1t .LBB5_2
 
 ; 32-GPR:        cmp.ult.d $[[FGRCC:f[0-9]+]], $f14, $f12
 ; 64-GPR:        cmp.ult.d $[[FGRCC:f[0-9]+]], $f13, $f12
 ; GPR:           mfc1     $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
 ; GPR-NOT:       not      $[[GPRCC]], $[[GPRCC]]
 ; 32-GPR:        beqz     $[[GPRCC]], $BB5_2
-; 64-GPR:        beqzc    $[[GPRCC]], $BB5_2
+; 64-GPR:        beqzc    $[[GPRCC]], .LBB5_2
 
   %cmp = fcmp ugt double %f2, %f3
   br i1 %cmp, label %if.else, label %if.then
diff --git a/test/CodeGen/Mips/jumptable_labels.ll b/test/CodeGen/Mips/jumptable_labels.ll
new file mode 100644
index 000000000000..8c7edc10689f
--- /dev/null
+++ b/test/CodeGen/Mips/jumptable_labels.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=O32
+; RUN: llc -march=mips64 -target-abi=n32 < %s | FileCheck %s -check-prefix=N32
+; RUN: llc -march=mips64 < %s | FileCheck %s -check-prefix=N64
+
+; We only use the '$' prefix on O32. The others use the ELF convention.
+; O32: $JTI0_0
+; N32: .LJTI0_0
+; N64: .LJTI0_0
+
+; Check basic block labels while we're at it.
+; O32: $BB0_2:
+; N32: .LBB0_2:
+; N64: .LBB0_2:
+
+@.str = private unnamed_addr constant [2 x i8] c"A\00", align 1
+@.str.1 = private unnamed_addr constant [2 x i8] c"B\00", align 1
+@.str.2 = private unnamed_addr constant [2 x i8] c"C\00", align 1
+@.str.3 = private unnamed_addr constant [2 x i8] c"D\00", align 1
+@.str.4 = private unnamed_addr constant [2 x i8] c"E\00", align 1
+@.str.5 = private unnamed_addr constant [2 x i8] c"F\00", align 1
+@.str.6 = private unnamed_addr constant [2 x i8] c"G\00", align 1
+@.str.7 = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+
+define i8* @_Z3fooi(i32 signext %Letter) {
+entry:
+  %retval = alloca i8*, align 8
+  %Letter.addr = alloca i32, align 4
+  store i32 %Letter, i32* %Letter.addr, align 4
+  %0 = load i32, i32* %Letter.addr, align 4
+  switch i32 %0, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+    i32 4, label %sw.bb4
+    i32 5, label %sw.bb5
+    i32 6, label %sw.bb6
+  ]
+
+sw.bb:
+  store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+sw.bb1:
+  store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+sw.bb2:
+  store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+sw.bb3:
+  store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.3, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+sw.bb4:
+  store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.4, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+sw.bb5:
+  store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.5, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+sw.bb6:
+  store i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.6, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+sw.epilog:
+  store i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str.7, i32 0, i32 0), i8** %retval, align 8
+  br label %return
+
+return:
+  %1 = load i8*, i8** %retval, align 8
+  ret i8* %1
+}
diff --git a/test/CodeGen/Mips/llvm-ir/ashr.ll b/test/CodeGen/Mips/llvm-ir/ashr.ll
index af9b81f9203f..cfb9855e6438 100644
--- a/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -167,18 +167,18 @@ entry:
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsrav     $[[T1:[0-9]+]], $4, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             bnez      $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]]
   ; M3:             move      $3, $[[T1]]
   ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $7
   ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $3, $[[T7]], $[[T4]]
-  ; M3:             $[[BB0]]:
-  ; M3:             beqz      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             [[BB0]]:
+  ; M3:             beqz      $[[T3]], [[BB1:.LBB[0-9_]+]]
   ; M3:             nop
   ; M3:             dsra      $2, $4, 63
-  ; M3:             $[[BB1]]:
+  ; M3:             [[BB1]]:
   ; M3:             jr        $ra
   ; M3:             nop
 
diff --git a/test/CodeGen/Mips/llvm-ir/indirectbr.ll b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
index d982b570d7c2..8fed32aee9be 100644
--- a/test/CodeGen/Mips/llvm-ir/indirectbr.ll
+++ b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
@@ -18,13 +18,13 @@ define i32 @br(i8 *%addr) {
 ; R6C:           jrc $4 # <MCInst #{{[0-9]+}} JIC
 
 
-; ALL: $BB0_1: # %L1
+; ALL: {{\$|\.L}}BB0_1: # %L1
 ; NOT-R6:        jr $ra # <MCInst #{{[0-9]+}} JR
 ; R6:            jr $ra # <MCInst #{{[0-9]+}} JALR
 ; R6C:           jr $ra # <MCInst #{{[0-9]+}} JALR
 ; ALL:           addiu $2, $zero, 0
 
-; ALL: $BB0_2: # %L2
+; ALL: {{\$|\.L}}BB0_2: # %L2
 ; NOT-R6:        jr $ra # <MCInst #{{[0-9]+}} JR
 ; R6:            jr $ra # <MCInst #{{[0-9]+}} JALR
 ; R6C:           jr $ra # <MCInst #{{[0-9]+}} JALR
diff --git a/test/CodeGen/Mips/llvm-ir/lshr.ll b/test/CodeGen/Mips/llvm-ir/lshr.ll
index 10748b9c803a..63fb075a4ad6 100644
--- a/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -158,18 +158,18 @@ entry:
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsrlv     $[[T1:[0-9]+]], $4, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             bnez      $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]]
   ; M3:             move      $3, $[[T1]]
   ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $7
   ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $3, $[[T7]], $[[T4]]
-  ; M3:             $[[BB0]]:
-  ; M3:             bnez      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             [[BB0]]:
+  ; M3:             bnez      $[[T3]], [[BB1:\.LBB[0-9_]+]]
   ; M3:             daddiu    $2, $zero, 0
   ; M3:             move      $2, $[[T1]]
-  ; M3:             $[[BB1]]:
+  ; M3:             [[BB1]]:
   ; M3:             jr        $ra
   ; M3:             nop
 
diff --git a/test/CodeGen/Mips/llvm-ir/select-dbl.ll b/test/CodeGen/Mips/llvm-ir/select-dbl.ll
index 1ca5b4e054ba..42f02c4cbabf 100644
--- a/test/CodeGen/Mips/llvm-ir/select-dbl.ll
+++ b/test/CodeGen/Mips/llvm-ir/select-dbl.ll
@@ -58,10 +58,10 @@ entry:
   ; SEL-32:     sel.d   $f0, $[[F1]], $[[F0]]
 
   ; M3:         andi    $[[T0:[0-9]+]], $4, 1
-  ; M3:         bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M3:         bnez    $[[T0]], [[BB0:.LBB[0-9_]+]]
   ; M3:         nop
   ; M3:         mov.d   $f13, $f14
-  ; M3:         $[[BB0]]:
+  ; M3:         [[BB0]]:
   ; M3:         jr      $ra
   ; M3:         mov.d   $f0, $f13
 
@@ -106,10 +106,10 @@ entry:
   ; SEL-32:     sel.d   $f0, $f14, $f12
 
   ; M3:         andi    $[[T0:[0-9]+]], $6, 1
-  ; M3:         bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M3:         bnez    $[[T0]], [[BB0:\.LBB[0-9_]+]]
   ; M3:         nop
   ; M3:         mov.d   $f12, $f13
-  ; M3:         $[[BB0]]:
+  ; M3:         [[BB0]]:
   ; M3:         jr      $ra
   ; M3:         mov.d   $f0, $f12
 
@@ -135,11 +135,12 @@ entry:
 
   ; M2:         c.olt.d   $f12, $f14
   ; M3:         c.olt.d   $f12, $f13
-  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1t      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1t      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.d     $f12, $f14
   ; M3:         mov.d     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.d     $f0, $f12
 
@@ -172,11 +173,12 @@ entry:
 
   ; M2:         c.ole.d   $f12, $f14
   ; M3:         c.ole.d   $f12, $f13
-  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1t      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1t      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.d     $f12, $f14
   ; M3:         mov.d     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.d     $f0, $f12
 
@@ -209,11 +211,12 @@ entry:
 
   ; M2:         c.ule.d   $f12, $f14
   ; M3:         c.ule.d   $f12, $f13
-  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1f      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1f      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.d     $f12, $f14
   ; M3:         mov.d     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.d     $f0, $f12
 
@@ -246,11 +249,12 @@ entry:
 
   ; M2:         c.ult.d   $f12, $f14
   ; M3:         c.ult.d   $f12, $f13
-  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1f      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1f      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.d     $f12, $f14
   ; M3:         mov.d     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.d     $f0, $f12
 
@@ -283,11 +287,12 @@ entry:
 
   ; M2:         c.eq.d    $f12, $f14
   ; M3:         c.eq.d    $f12, $f13
-  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1t      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1t      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.d     $f12, $f14
   ; M3:         mov.d     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.d     $f0, $f12
 
@@ -320,11 +325,12 @@ entry:
 
   ; M2:         c.ueq.d   $f12, $f14
   ; M3:         c.ueq.d   $f12, $f13
-  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1f      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1f      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.d     $f12, $f14
   ; M3:         mov.d     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.d     $f0, $f12
 
diff --git a/test/CodeGen/Mips/llvm-ir/select-flt.ll b/test/CodeGen/Mips/llvm-ir/select-flt.ll
index 6a0334da4833..e632897e76cd 100644
--- a/test/CodeGen/Mips/llvm-ir/select-flt.ll
+++ b/test/CodeGen/Mips/llvm-ir/select-flt.ll
@@ -34,12 +34,13 @@ entry:
   ; ALL-LABEL: tst_select_i1_float:
 
   ; M2-M3:      andi    $[[T0:[0-9]+]], $4, 1
-  ; M2-M3:      bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2:         bnez    $[[T0]], [[BB0:\$BB[0-9_]+]]
+  ; M3:         bnez    $[[T0]], [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         jr      $ra
   ; M2:         mtc1    $6, $f0
   ; M3:         mov.s   $f13, $f14
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr      $ra
   ; M2:         mtc1    $5, $f0
   ; M3:         mov.s   $f0, $f13
@@ -76,11 +77,12 @@ entry:
   ; ALL-LABEL: tst_select_i1_float_reordered:
 
   ; M2-M3:      andi    $[[T0:[0-9]+]], $6, 1
-  ; M2-M3:      bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2:         bnez    $[[T0]], [[BB0:\$BB[0-9_]+]]
+  ; M3:         bnez    $[[T0]], [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.s   $f12, $f14
   ; M3:         mov.s   $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr      $ra
   ; M2-M3:      mov.s   $f0, $f12
 
@@ -112,11 +114,12 @@ entry:
 
   ; M2:         c.olt.s   $f12, $f14
   ; M3:         c.olt.s   $f12, $f13
-  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1t      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1t      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.s     $f12, $f14
   ; M3:         mov.s     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.s     $f0, $f12
 
@@ -149,11 +152,12 @@ entry:
 
   ; M2:         c.ole.s   $f12, $f14
   ; M3:         c.ole.s   $f12, $f13
-  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1t      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1t      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.s     $f12, $f14
   ; M3:         mov.s     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.s     $f0, $f12
 
@@ -186,11 +190,12 @@ entry:
 
   ; M2:         c.ule.s   $f12, $f14
   ; M3:         c.ule.s   $f12, $f13
-  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1f      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1f      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.s     $f12, $f14
   ; M3:         mov.s     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.s     $f0, $f12
 
@@ -223,11 +228,12 @@ entry:
 
   ; M2:         c.ult.s   $f12, $f14
   ; M3:         c.ult.s   $f12, $f13
-  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1f      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1f      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.s     $f12, $f14
   ; M3:         mov.s     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.s     $f0, $f12
 
@@ -260,11 +266,12 @@ entry:
 
   ; M2:         c.eq.s    $f12, $f14
   ; M3:         c.eq.s    $f12, $f13
-  ; M2-M3:      bc1t      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1t      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1t      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.s     $f12, $f14
   ; M3:         mov.s     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.s     $f0, $f12
 
@@ -297,11 +304,12 @@ entry:
 
   ; M2:         c.ueq.s   $f12, $f14
   ; M3:         c.ueq.s   $f12, $f13
-  ; M2-M3:      bc1f      $[[BB0:BB[0-9_]+]]
+  ; M2:         bc1f      [[BB0:\$BB[0-9_]+]]
+  ; M3:         bc1f      [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:      nop
   ; M2:         mov.s     $f12, $f14
   ; M3:         mov.s     $f12, $f13
-  ; M2-M3:      $[[BB0]]:
+  ; M2-M3:      [[BB0]]:
   ; M2-M3:      jr        $ra
   ; M2-M3:      mov.s     $f0, $f12
 
diff --git a/test/CodeGen/Mips/llvm-ir/select-int.ll b/test/CodeGen/Mips/llvm-ir/select-int.ll
index e8f78ffdcb6a..5bee3f1dbd52 100644
--- a/test/CodeGen/Mips/llvm-ir/select-int.ll
+++ b/test/CodeGen/Mips/llvm-ir/select-int.ll
@@ -35,10 +35,11 @@ entry:
   ; ALL-LABEL: tst_select_i1_i1:
 
   ; M2-M3:  andi    $[[T0:[0-9]+]], $4, 1
-  ; M2-M3:  bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2:     bnez    $[[T0]], [[BB0:\$BB[0-9_]+]]
+  ; M3:     bnez    $[[T0]], [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:  nop
   ; M2-M3:  move    $5, $6
-  ; M2-M3:  $[[BB0]]:
+  ; M2-M3:  [[BB0]]:
   ; M2-M3:  jr      $ra
   ; M2-M3:  move    $2, $5
 
@@ -70,10 +71,11 @@ entry:
   ; ALL-LABEL: tst_select_i1_i8:
 
   ; M2-M3:  andi    $[[T0:[0-9]+]], $4, 1
-  ; M2-M3:  bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2:     bnez    $[[T0]], [[BB0:\$BB[0-9_]+]]
+  ; M3:     bnez    $[[T0]], [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:  nop
   ; M2-M3:  move    $5, $6
-  ; M2-M3:  $[[BB0]]:
+  ; M2-M3:  [[BB0]]:
   ; M2-M3:  jr      $ra
   ; M2-M3:  move    $2, $5
 
@@ -105,10 +107,11 @@ entry:
   ; ALL-LABEL: tst_select_i1_i32:
 
   ; M2-M3:  andi    $[[T0:[0-9]+]], $4, 1
-  ; M2-M3:  bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M2:     bnez    $[[T0]], [[BB0:\$BB[0-9_]+]]
+  ; M3:     bnez    $[[T0]], [[BB0:\.LBB[0-9_]+]]
   ; M2-M3:  nop
   ; M2-M3:  move    $5, $6
-  ; M2-M3:  $[[BB0]]:
+  ; M2-M3:  [[BB0]]:
   ; M2-M3:  jr      $ra
   ; M2-M3:  move    $2, $5
 
@@ -170,10 +173,10 @@ entry:
   ; SEL-32:     or      $3, $[[T4]], $[[T6]]
 
   ; M3:         andi    $[[T0:[0-9]+]], $4, 1
-  ; M3:         bnez    $[[T0]], $[[BB0:BB[0-9_]+]]
+  ; M3:         bnez    $[[T0]], [[BB0:\.LBB[0-9_]+]]
   ; M3:         nop
   ; M3:         move    $5, $6
-  ; M3:         $[[BB0]]:
+  ; M3:         [[BB0]]:
   ; M3:         jr      $ra
   ; M3:         move    $2, $5
 
@@ -214,19 +217,19 @@ define i8* @tst_select_word_cst(i8* %a, i8* %b) {
   ; M2:         addiu   $[[T0:[0-9]+]], $zero, -1
   ; M2:         xor     $[[T1:[0-9]+]], $5, $[[T0]]
   ; M2:         sltu    $[[T2:[0-9]+]], $zero, $[[T1]]
-  ; M2:         bnez    $[[T2]], $[[BB0:BB[0-9_]+]]
+  ; M2:         bnez    $[[T2]], [[BB0:\$BB[0-9_]+]]
   ; M2:         addiu   $2, $zero, 0
   ; M2:         move    $2, $4
-  ; M2: $[[BB0]]:
+  ; M2: [[BB0]]:
   ; M2:         jr      $ra
 
   ; M3:         daddiu  $[[T0:[0-9]+]], $zero, -1
   ; M3:         xor     $[[T1:[0-9]+]], $5, $[[T0]]
   ; M3:         sltu    $[[T2:[0-9]+]], $zero, $[[T1]]
-  ; M3:         bnez    $[[T2]], $[[BB0:BB[0-9_]+]]
+  ; M3:         bnez    $[[T2]], [[BB0:\.LBB[0-9_]+]]
   ; M3:         daddiu  $2, $zero, 0
   ; M3:         move    $2, $4
-  ; M3: $[[BB0]]:
+  ; M3: [[BB0]]:
   ; M3:         jr      $ra
 
   ; CMOV-32:    addiu   $[[T0:[0-9]+]], $zero, -1
diff --git a/test/CodeGen/Mips/llvm-ir/shl.ll b/test/CodeGen/Mips/llvm-ir/shl.ll
index fa43840a8b7b..74b6155032a3 100644
--- a/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -174,18 +174,18 @@ entry:
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsllv     $[[T1:[0-9]+]], $5, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             bnez      $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]]
   ; M3:             move      $2, $[[T1]]
   ; M3:             dsllv     $[[T4:[0-9]+]], $4, $7
   ; M3:             dsrl      $[[T5:[0-9]+]], $5, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsrlv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $2, $[[T4]], $[[T7]]
-  ; M3:             $[[BB0]]:
-  ; M3:             bnez      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             [[BB0]]:
+  ; M3:             bnez      $[[T3]], [[BB1:\.LBB[0-9_]+]]
   ; M3:             daddiu    $3, $zero, 0
   ; M3:             move      $3, $[[T1]]
-  ; M3:             $[[BB1]]:
+  ; M3:             [[BB1]]:
   ; M3:             jr        $ra
   ; M3:             nop
 
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 06eda11e7888..59e284165d40 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -84,28 +84,28 @@ end:
 ; Check the MIPS64 version.
 
 ; N64:        lui     $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test1)))
-; N64:        bnez    $4, $[[BB0:BB[0-9_]+]]
+; N64:        bnez    $4, [[BB0:\.LBB[0-9_]+]]
 ; N64:        daddu   $[[R1:[0-9]+]], $[[R0]], $25
 
 ; Check for long branch expansion:
 ; N64:           daddiu  $sp, $sp, -16
 ; N64-NEXT:      sd      $ra, 0($sp)
-; N64-NEXT:      daddiu  $1, $zero, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; N64-NEXT:      daddiu  $1, $zero, %hi([[BB2:\.LBB[0-9_]+]]-[[BB1:\.LBB[0-9_]+]])
 ; N64-NEXT:      dsll    $1, $1, 16
-; N64-NEXT:      bal     $[[BB1]]
-; N64-NEXT:      daddiu  $1, $1, %lo(($[[BB2]])-($[[BB1]]))
-; N64-NEXT:  $[[BB1]]:
+; N64-NEXT:      bal     [[BB1]]
+; N64-NEXT:      daddiu  $1, $1, %lo([[BB2]]-[[BB1]])
+; N64-NEXT:  [[BB1]]:
 ; N64-NEXT:      daddu   $1, $ra, $1
 ; N64-NEXT:      ld      $ra, 0($sp)
 ; N64-NEXT:      jr      $1
 ; N64-NEXT:      daddiu  $sp, $sp, 16
 
-; N64:   $[[BB0]]:
+; N64:   [[BB0]]:
 ; N64:        daddiu  $[[GP:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test1)))
 ; N64:        ld      $[[R2:[0-9]+]], %got_disp(x)($[[GP]])
 ; N64:        addiu   $[[R3:[0-9]+]], $zero, 1
 ; N64:        sw      $[[R3]], 0($[[R2]])
-; N64:   $[[BB2]]:
+; N64:   [[BB2]]:
 ; N64:        jr      $ra
 ; N64:        nop
 
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
index 5d253d7af253..d7a05800a273 100644
--- a/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -36,14 +36,14 @@ define void @const_v16i8() nounwind {
 
   store volatile <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 31>, <16 x i8>*@v16i8
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.b  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6>, <16 x i8>*@v16i8
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.b  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <16 x i8> <i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0>, <16 x i8>*@v16i8
@@ -59,8 +59,8 @@ define void @const_v16i8() nounwind {
 
   store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, <16 x i8>*@v16i8
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.b  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   ret void
@@ -77,8 +77,8 @@ define void @const_v8i16() nounwind {
 
   store volatile <8 x i16> <i16 1, i16 1, i16 1, i16 2, i16 1, i16 1, i16 1, i16 31>, <8 x i16>*@v8i16
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.h  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>, <8 x i16>*@v8i16
@@ -93,8 +93,8 @@ define void @const_v8i16() nounwind {
 
   store volatile <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>, <8 x i16>*@v8i16
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.h  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   ret void
@@ -111,8 +111,8 @@ define void @const_v4i32() nounwind {
 
   store volatile <4 x i32> <i32 1, i32 1, i32 1, i32 31>, <4 x i32>*@v4i32
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <4 x i32> <i32 16843009, i32 16843009, i32 16843009, i32 16843009>, <4 x i32>*@v4i32
@@ -123,14 +123,14 @@ define void @const_v4i32() nounwind {
 
   store volatile <4 x i32> <i32 1, i32 2, i32 1, i32 2>, <4 x i32>*@v4i32
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <4 x i32> <i32 3, i32 4, i32 5, i32 6>, <4 x i32>*@v4i32
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   ret void
@@ -156,15 +156,15 @@ define void @const_v2i64() nounwind {
 
   store volatile <2 x i64> <i64 1, i64 31>, <2 x i64>*@v2i64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; MIPS32: ld.w [[R1:\$w[0-9]+]], 0([[G_PTR]])
   ; MIPS64: ld.d [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <2 x i64> <i64 3, i64 4>, <2 x i64>*@v2i64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; MIPS32: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
   ; MIPS64: ld.d  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll
index d714b3eec1f2..15468781f308 100644
--- a/test/CodeGen/Mips/msa/basic_operations_float.ll
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -23,8 +23,8 @@ define void @const_v4f32() nounwind {
 
   store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 31.0>, <4 x float>*@v4f32
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <4 x float> <float 65537.0, float 65537.0, float 65537.0, float 65537.0>, <4 x float>*@v4f32
@@ -34,14 +34,14 @@ define void @const_v4f32() nounwind {
 
   store volatile <4 x float> <float 1.0, float 2.0, float 1.0, float 2.0>, <4 x float>*@v4f32
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <4 x float> <float 3.0, float 4.0, float 5.0, float 6.0>, <4 x float>*@v4f32
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   ret void
@@ -55,38 +55,38 @@ define void @const_v2f64() nounwind {
 
   store volatile <2 x double> <double 72340172838076673.0, double 72340172838076673.0>, <2 x double>*@v2f64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.d  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <2 x double> <double 281479271743489.0, double 281479271743489.0>, <2 x double>*@v2f64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.d  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <2 x double> <double 4294967297.0, double 4294967297.0>, <2 x double>*@v2f64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.d  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <2 x double> <double 1.0, double 1.0>, <2 x double>*@v2f64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.d  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <2 x double> <double 1.0, double 31.0>, <2 x double>*@v2f64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.d  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   store volatile <2 x double> <double 3.0, double 4.0>, <2 x double>*@v2f64
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst($
+  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
+  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
   ; ALL: ld.d  [[R1:\$w[0-9]+]], 0([[G_PTR]])
 
   ret void
diff --git a/test/CodeGen/Mips/octeon.ll b/test/CodeGen/Mips/octeon.ll
index b441274cec6b..7e2a81019ac4 100644
--- a/test/CodeGen/Mips/octeon.ll
+++ b/test/CodeGen/Mips/octeon.ll
@@ -91,9 +91,9 @@ entry:
 define i64 @bbit0(i64 %a) nounwind {
 entry:
 ; ALL-LABEL: bbit0:
-; OCTEON: bbit0   $4, 3, $[[BB0:BB[0-9_]+]]
+; OCTEON: bbit0   $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]]
 ; MIPS64: andi  $[[T0:[0-9]+]], $4, 8
-; MIPS64: bnez  $[[T0]], $[[BB0:BB[0-9_]+]]
+; MIPS64: bnez  $[[T0]], [[BB0:(\$|\.L)BB[0-9_]+]]
   %bit = and i64 %a, 8
   %res = icmp eq i64 %bit, 0
   br i1 %res, label %endif, label %if
@@ -107,11 +107,11 @@ endif:
 define i64 @bbit032(i64 %a) nounwind {
 entry:
 ; ALL-LABEL: bbit032:
-; OCTEON: bbit032 $4, 3, $[[BB0:BB[0-9_]+]]
+; OCTEON: bbit032 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]]
 ; MIPS64: daddiu  $[[T0:[0-9]+]], $zero, 1
 ; MIPS64: dsll    $[[T1:[0-9]+]], $[[T0]], 35
 ; MIPS64: and     $[[T2:[0-9]+]], $4, $[[T1]]
-; MIPS64: bnez    $[[T2]], $[[BB0:BB[0-9_]+]]
+; MIPS64: bnez    $[[T2]], [[BB0:(\$|\.L)BB[0-9_]+]]
   %bit = and i64 %a, 34359738368
   %res = icmp eq i64 %bit, 0
   br i1 %res, label %endif, label %if
@@ -125,9 +125,9 @@ endif:
 define i64 @bbit1(i64 %a) nounwind {
 entry:
 ; ALL-LABEL: bbit1:
-; OCTEON: bbit1 $4, 3, $[[BB0:BB[0-9_]+]]
+; OCTEON: bbit1 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]]
 ; MIPS64: andi  $[[T0:[0-9]+]], $4, 8
-; MIPS64: beqz  $[[T0]], $[[BB0:BB[0-9_]+]]
+; MIPS64: beqz  $[[T0]], [[BB0:(\$|\.L)BB[0-9_]+]]
   %bit = and i64 %a, 8
   %res = icmp ne i64 %bit, 0
   br i1 %res, label %endif, label %if
@@ -141,11 +141,11 @@ endif:
 define i64 @bbit132(i64 %a) nounwind {
 entry:
 ; ALL-LABEL: bbit132:
-; OCTEON: bbit132 $4, 3, $[[BB0:BB[0-9_]+]]
+; OCTEON: bbit132 $4, 3, [[BB0:(\$|\.L)BB[0-9_]+]]
 ; MIPS64: daddiu  $[[T0:[0-9]+]], $zero, 1
 ; MIPS64: dsll    $[[T1:[0-9]+]], $[[T0]], 35
 ; MIPS64: and     $[[T2:[0-9]+]], $4, $[[T1]]
-; MIPS64: beqz    $[[T2]], $[[BB0:BB[0-9_]+]]
+; MIPS64: beqz    $[[T2]], [[BB0:(\$|\.L)BB[0-9_]+]]
   %bit = and i64 %a, 34359738368
   %res = icmp ne i64 %bit, 0
   br i1 %res, label %endif, label %if
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index c7cf857e1d44..f886e1ff814d 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -681,10 +681,11 @@ define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
 ; X64-NEXT:    vcvttpd2dqy %ymm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
-  %cvt = fptosi <4 x double> %a0 to <4 x i32>
+  %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
   %res = bitcast <4 x i32> %cvt to <2 x i64>
   ret <2 x i64> %res
 }
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
 
 define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
 ; X32-LABEL: test_mm256_cvttps_epi32:
@@ -696,10 +697,11 @@ define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
 ; X64:       # BB#0:
 ; X64-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; X64-NEXT:    retq
-  %cvt = fptosi <8 x float> %a0 to <8 x i32>
+  %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
   %res = bitcast <8 x i32> %cvt to <4 x i64>
   ret <4 x i64> %res
 }
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
 
 define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
 ; X32-LABEL: test_mm256_div_pd:
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index a7b4c6b285d8..0630fd8a93ca 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -359,35 +359,12 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
 declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
 
 
-define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvttpd2dqy %ymm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retl
-  %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT:    retl
-  %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
-
-
 define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
   ; add operation forces the execution domain.
 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    vpaddb LCPI34_0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddb LCPI32_0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqu %xmm0, (%eax)
 ; CHECK-NEXT:    retl
   %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 35763297d816..c5d60da8f900 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -3431,6 +3431,39 @@ define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) {
 declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
 
 
+define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcvttpd2dqy %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vcvttpd2dqy %ymm0, %xmm0
+; AVX512VL-NEXT:    retl
+  %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX-NEXT:    retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX512VL-NEXT:    retl
+  %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
+
+
 define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
 ; AVX-LABEL: test_x86_avx_dp_ps_256:
 ; AVX:       ## BB#0:
@@ -4552,7 +4585,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
 ; AVX-LABEL: movnt_dq:
 ; AVX:       ## BB#0:
 ; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq LCPI256_0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovntdq %ymm0, (%eax)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retl
@@ -4560,7 +4593,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
 ; AVX512VL-LABEL: movnt_dq:
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT:    vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpaddq LCPI256_0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovntdq %ymm0, (%eax)
 ; AVX512VL-NEXT:    retl
   %a2 = add <2 x i64> %a1, <i64 1, i64 1>
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 914f859927be..d2410e4a0a5d 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -744,6 +744,36 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
   ret <8 x double> %1
 }
 
+define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
+; KNL-LABEL: sitofp_16i1_double:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; KNL-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
+; KNL-NEXT:    vcmpltpd %zmm0, %zmm2, %k2
+; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k2} {z}
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT:    vpmovqd %zmm1, %ymm1
+; KNL-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: sitofp_16i1_double:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; SKX-NEXT:    vcmpltpd %zmm1, %zmm2, %k0
+; SKX-NEXT:    vcmpltpd %zmm0, %zmm2, %k1
+; SKX-NEXT:    vpmovm2d %k1, %ymm0
+; SKX-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; SKX-NEXT:    vpmovm2d %k0, %ymm1
+; SKX-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; SKX-NEXT:    retq
+  %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %cmpres to <16 x double>
+  ret <16 x double> %1
+}
+
 define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
 ; KNL-LABEL: sitofp_8i1_double:
 ; KNL:       ## BB#0:
@@ -767,6 +797,130 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
   ret <8 x double> %1
 }
 
+define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
+; KNL-LABEL: sitofp_8i1_float:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vcmpltps %zmm0, %zmm1, %k1
+; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: sitofp_8i1_float:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vcmpltps %ymm0, %ymm1, %k0
+; SKX-NEXT:    vpmovm2d %k0, %ymm0
+; SKX-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; SKX-NEXT:    retq
+  %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
+  %1 = sitofp <8 x i1> %cmpres to <8 x float>
+  ret <8 x float> %1
+}
+
+define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
+; KNL-LABEL: sitofp_4i1_float:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: sitofp_4i1_float:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vcmpltps %xmm0, %xmm1, %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %cmpres = fcmp ogt <4 x float> %a, zeroinitializer
+  %1 = sitofp <4 x i1> %cmpres to <4 x float>
+  ret <4 x float> %1
+}
+
+define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
+; KNL-LABEL: sitofp_4i1_double:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    vpslld $31, %xmm0, %xmm0
+; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0
+; KNL-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: sitofp_4i1_double:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vcmpltpd %ymm0, %ymm1, %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; SKX-NEXT:    retq
+  %cmpres = fcmp ogt <4 x double> %a, zeroinitializer
+  %1 = sitofp <4 x i1> %cmpres to <4 x double>
+  ret <4 x double> %1
+}
+
+define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
+; KNL-LABEL: sitofp_2i1_float:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; KNL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; KNL-NEXT:    vpsrad $31, %xmm0, %xmm1
+; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    xorl %ecx, %ecx
+; KNL-NEXT:    testb $1, %al
+; KNL-NEXT:    movl $-1, %eax
+; KNL-NEXT:    movl $0, %edx
+; KNL-NEXT:    cmovnel %eax, %edx
+; KNL-NEXT:    vcvtsi2ssl %edx, %xmm0, %xmm1
+; KNL-NEXT:    vmovq %xmm0, %rdx
+; KNL-NEXT:    testb $1, %dl
+; KNL-NEXT:    cmovnel %eax, %ecx
+; KNL-NEXT:    vcvtsi2ssl %ecx, %xmm0, %xmm0
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: sitofp_2i1_float:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vcmpltps %xmm0, %xmm1, %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
+  %1 = sitofp <2 x i1> %cmpres to <2 x float>
+  ret <2 x float> %1
+}
+
+define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
+; KNL-LABEL: sitofp_2i1_double:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: sitofp_2i1_double:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vcmpltpd %xmm0, %xmm1, %k0
+; SKX-NEXT:    vpmovm2q %k0, %xmm0
+; SKX-NEXT:    vcvtqq2pd %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
+  %1 = sitofp <2 x i1> %cmpres to <2 x double>
+  ret <2 x double> %1
+}
+
 define <16 x float> @uitofp_16i8(<16 x i8>%a) {
 ; ALL-LABEL: uitofp_16i8:
 ; ALL:       ## BB#0:
@@ -787,3 +941,196 @@ define <16 x float> @uitofp_16i16(<16 x i16>%a) {
   ret <16 x float>%b
 }
 
+define <16 x float> @uitofp_16i1_float(<16 x i32> %a) {
+; ALL-LABEL: uitofp_16i1_float:
+; ALL:       ## BB#0:
+; ALL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; ALL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; ALL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = uitofp <16 x i1> %mask to <16 x float>
+  ret <16 x float> %1
+}
+
+define <16 x double> @uitofp_16i1_double(<16 x i32> %a) {
+; KNL-LABEL: uitofp_16i1_double:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; KNL-NEXT:    movq {{.*}}(%rip), %rax
+; KNL-NEXT:    vpbroadcastq %rax, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; KNL-NEXT:    kshiftrw $8, %k1, %k1
+; KNL-NEXT:    vpbroadcastq %rax, %zmm1 {%k1} {z}
+; KNL-NEXT:    vpmovqd %zmm1, %ymm1
+; KNL-NEXT:    vcvtudq2pd %ymm1, %zmm1
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: uitofp_16i1_double:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; SKX-NEXT:    movl {{.*}}(%rip), %eax
+; SKX-NEXT:    vpbroadcastd %eax, %ymm0 {%k1} {z}
+; SKX-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; SKX-NEXT:    kshiftrw $8, %k1, %k1
+; SKX-NEXT:    vpbroadcastd %eax, %ymm1 {%k1} {z}
+; SKX-NEXT:    vcvtudq2pd %ymm1, %zmm1
+; SKX-NEXT:    retq
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = uitofp <16 x i1> %mask to <16 x double>
+  ret <16 x double> %1
+}
+
+define <8 x float> @uitofp_8i1_float(<8 x i32> %a) {
+; KNL-LABEL: uitofp_8i1_float:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: uitofp_8i1_float:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1
+; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; SKX-NEXT:    retq
+  %mask = icmp slt <8 x i32> %a, zeroinitializer
+  %1 = uitofp <8 x i1> %mask to <8 x float>
+  ret <8 x float> %1
+}
+
+define <8 x double> @uitofp_8i1_double(<8 x i32> %a) {
+; KNL-LABEL: uitofp_8i1_double:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: uitofp_8i1_double:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1
+; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; SKX-NEXT:    retq
+  %mask = icmp slt <8 x i32> %a, zeroinitializer
+  %1 = uitofp <8 x i1> %mask to <8 x double>
+  ret <8 x double> %1
+}
+
+define <4 x float> @uitofp_4i1_float(<4 x i32> %a) {
+; KNL-LABEL: uitofp_4i1_float:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpsrld $31, %xmm0, %xmm0
+; KNL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: uitofp_4i1_float:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
+; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %mask = icmp slt <4 x i32> %a, zeroinitializer
+  %1 = uitofp <4 x i1> %mask to <4 x float>
+  ret <4 x float> %1
+}
+
+define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
+; KNL-LABEL: uitofp_4i1_double:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpsrld $31, %xmm0, %xmm0
+; KNL-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: uitofp_4i1_double:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
+; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; SKX-NEXT:    retq
+  %mask = icmp slt <4 x i32> %a, zeroinitializer
+  %1 = uitofp <4 x i1> %mask to <4 x double>
+  ret <4 x double> %1
+}
+
+define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
+; KNL-LABEL: uitofp_2i1_float:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; KNL-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vcvtsi2ssl %eax, %xmm0, %xmm1
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vcvtsi2ssl %eax, %xmm0, %xmm0
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: uitofp_2i1_float:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
+; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %mask = icmp ult <2 x i32> %a, zeroinitializer
+  %1 = uitofp <2 x i1> %mask to <2 x float>
+  ret <2 x float> %1
+}
+
+define <2 x double> @uitofp_2i1_double(<2 x i32> %a) {
+; KNL-LABEL: uitofp_2i1_double:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; KNL-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    vcvtusi2sdq %rax, %xmm0, %xmm1
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    vcvtusi2sdq %rax, %xmm0, %xmm0
+; KNL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: uitofp_2i1_double:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
+; SKX-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT:    vcvtuqq2pd %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %mask = icmp ult <2 x i32> %a, zeroinitializer
+  %1 = uitofp <2 x i1> %mask to <2 x double>
+  ret <2 x double> %1
+}
diff --git a/test/CodeGen/X86/pr28504.ll b/test/CodeGen/X86/pr28504.ll
new file mode 100644
index 000000000000..a617c8aa4f1e
--- /dev/null
+++ b/test/CodeGen/X86/pr28504.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; The test case is rather involved, because we need to get to a state where
+; We have a sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) combine,
+; BUT this combine is only triggered post-legalization, so the setcc's return
+; type is i8. So we can't have the combine opportunity be exposed too early.
+; Basically, what we want to see is that the compare result zero-extended, and 
+; then stored. Only one zext, and no sexts.
+
+; CHECK-LABEL: main:
+; CHECK: movzbl (%rdi), %[[EAX:.*]]
+; CHECK-NEXT: xorl  %e[[C:.]]x, %e[[C]]x
+; CHECK-NEXT: cmpl  $1, %[[EAX]]
+; CHECK-NEXT: sete  %[[C]]l
+; CHECK-NEXT: movl  %e[[C]]x, (%rsi)
+define void @main(i8* %p, i32* %q) {
+bb:
+  %tmp4 = load i8, i8* %p, align 1
+  %tmp5 = sext i8 %tmp4 to i32
+  %tmp6 = load i8, i8* %p, align 1
+  %tmp7 = zext i8 %tmp6 to i32
+  %tmp8 = sub nsw i32 %tmp5, %tmp7
+  %tmp11 = icmp eq i32 %tmp7, 1
+  %tmp12 = zext i1 %tmp11 to i32
+  %tmp13 = add nsw i32 %tmp8, %tmp12
+  %tmp14 = trunc i32 %tmp13 to i8
+  %tmp15 = sext i8 %tmp14 to i16
+  %tmp16 = sext i16 %tmp15 to i32
+  store i32 %tmp16, i32* %q, align 4
+  br i1 %tmp11, label %bb21, label %bb22
+
+bb21:                                             ; preds = %bb
+  unreachable
+
+bb22:                                             ; preds = %bb
+  ret void
+}
diff --git a/test/CodeGen/X86/pr28824.ll b/test/CodeGen/X86/pr28824.ll
new file mode 100644
index 000000000000..ced1f00dd01b
--- /dev/null
+++ b/test/CodeGen/X86/pr28824.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
+
+@d = global i32 0, align 4
+
+; Verify the sar happens before ecx is clobbered with the parameter being
+; passed to fn3
+; CHECK-LABEL: fn4
+; CHECK: movb d, %cl
+; CHECK: sarl %cl
+; CHECK: movl $2, %ecx
+define i32 @fn4(i32 %i) #0 {
+entry:
+  %0 = load i32, i32* @d, align 4
+  %shr = ashr i32 %i, %0
+  tail call fastcc void @fn3(i32 2, i32 5, i32 %shr, i32 %i)
+  %cmp = icmp slt i32 %shr, 1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
+declare void @fn3(i32 %p1, i32 %p2, i32 %p3, i32 %p4) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
index 2102b4211153..aad00e71dda0 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -6,13 +6,12 @@
 define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_ss:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtsi2ssq %rdi, %xmm1
-; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    cvtsi2ssq %rdi, %xmm0
 ; X64-NEXT:    retq
-  %cvt = sitofp i64 %a1 to float
-  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
   ret <4 x float> %res
 }
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
 
 define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
 ; X64-LABEL: test_mm_cvtss_si64:
@@ -29,7 +28,7 @@ define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttss2si %xmm0, %rax
 ; X64-NEXT:    retq
-  %cvt = extractelement <4 x float> %a0, i32 0
-  %res = fptosi float %cvt to i64
+  %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
   ret i64 %res
 }
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 090ddfdfa93a..4715b7f00fcb 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -707,20 +707,17 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
 define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
 ; X32-LABEL: test_mm_cvtsi32_ss:
 ; X32:       # BB#0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    cvtsi2ssl %eax, %xmm1
-; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_cvtsi32_ss:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtsi2ssl %edi, %xmm1
-; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    cvtsi2ssl %edi, %xmm0
 ; X64-NEXT:    retq
-  %cvt = sitofp i32 %a1 to float
-  %res = insertelement <4 x float> %a0, float %cvt, i32 0
+  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
   ret <4 x float> %res
 }
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
 
 define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
 ; X32-LABEL: test_mm_cvtss_f32:
@@ -762,10 +759,10 @@ define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    retq
-  %cvt = extractelement <4 x float> %a0, i32 0
-  %res = fptosi float %cvt to i32
+  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
   ret i32 %res
 }
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
 
 define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
 ; X32-LABEL: test_mm_cvttss_si32:
@@ -777,8 +774,7 @@ define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    retq
-  %cvt = extractelement <4 x float> %a0, i32 0
-  %res = fptosi float %cvt to i32
+  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
   ret i32 %res
 }
 
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
index f5ecfa444d86..6b9dc40a59e2 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
@@ -25,13 +25,12 @@ define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
 define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_sd:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtsi2sdq %rdi, %xmm1
-; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT:    cvtsi2sdq %rdi, %xmm0
 ; X64-NEXT:    retq
-  %cvt = sitofp i64 %a1 to double
-  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
   ret <2 x double> %res
 }
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
 
 define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_si128:
@@ -48,10 +47,10 @@ define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttsd2si %xmm0, %rax
 ; X64-NEXT:    retq
-  %ext = extractelement <2 x double> %a0, i32 0
-  %res = fptosi double %ext to i64
+  %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
   ret i64 %res
 }
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
 
 define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
 ; X64-LABEL: test_mm_loadu_si64:
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index fa71325d7d6e..d3ebba93c769 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -1208,6 +1208,39 @@ define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind {
 }
 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 
+define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_cvtsd_ss:
+; X32:       # BB#0:
+; X32-NEXT:    cvtsd2ss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsd_ss:
+; X64:       # BB#0:
+; X64-NEXT:    cvtsd2ss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) {
+; X32-LABEL: test_mm_cvtsd_ss_load:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps (%eax), %xmm1
+; X32-NEXT:    cvtsd2ss %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_cvtsd_ss_load:
+; X64:       # BB#0:
+; X64-NEXT:    movaps (%rdi), %xmm1
+; X64-NEXT:    cvtsd2ss %xmm1, %xmm0
+; X64-NEXT:    retq
+  %a1 = load <2 x double>, <2 x double>* %p1
+  %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+  ret <4 x float> %res
+}
+
 define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
 ; X32-LABEL: test_mm_cvtsi128_si32:
 ; X32:       # BB#0:
@@ -1303,10 +1336,11 @@ define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttps2dq %xmm0, %xmm0
 ; X64-NEXT:    retq
-  %res = fptosi <4 x float> %a0 to <4 x i32>
+  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
   %bc = bitcast <4 x i32> %res to <2 x i64>
   ret <2 x i64> %bc
 }
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
 
 define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
 ; X32-LABEL: test_mm_cvttsd_si32:
@@ -1318,10 +1352,10 @@ define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
 ; X64:       # BB#0:
 ; X64-NEXT:    cvttsd2si %xmm0, %eax
 ; X64-NEXT:    retq
-  %ext = extractelement <2 x double> %a0, i32 0
-  %res = fptosi double %ext to i32
+  %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
   ret i32 %res
 }
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
 
 define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_div_pd:
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index ae6626bb0dc5..27a3fce0be2a 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -66,17 +66,6 @@ define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
 declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
 
 
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttps2dq:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    cvttps2dq %xmm0, %xmm0
-; CHECK-NEXT:    retl
-  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
 define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_x86_sse2_storel_dq:
 ; CHECK:       ## BB#0:
@@ -94,7 +83,7 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    paddb LCPI8_0, %xmm0
+; CHECK-NEXT:    paddb LCPI7_0, %xmm0
 ; CHECK-NEXT:    movdqu %xmm0, (%eax)
 ; CHECK-NEXT:    retl
   %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 617e30e4b92c..3ae3aecabaf5 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
 
@@ -274,6 +274,25 @@ define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
 declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
 
 
+define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %p1) {
+; SSE-LABEL: test_x86_sse2_cvtsd2ss_load:
+; SSE:       ## BB#0:
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movaps (%eax), %xmm1
+; SSE-NEXT:    cvtsd2ss %xmm1, %xmm0
+; SSE-NEXT:    retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsd2ss_load:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT:    vcvtsd2ss (%eax), %xmm0, %xmm0
+; KNL-NEXT:    retl
+  %a1 = load <2 x double>, <2 x double>* %p1
+  %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+
+
 define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
 ; SSE-LABEL: test_x86_sse2_cvtsi2sd:
 ; SSE:       ## BB#0:
@@ -306,6 +325,25 @@ define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
 declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
 
 
+define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, <4 x float>* %p1) {
+; SSE-LABEL: test_x86_sse2_cvtss2sd_load:
+; SSE:       ## BB#0:
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    movaps (%eax), %xmm1
+; SSE-NEXT:    cvtss2sd %xmm1, %xmm0
+; SSE-NEXT:    retl
+;
+; KNL-LABEL: test_x86_sse2_cvtss2sd_load:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT:    vcvtss2sd (%eax), %xmm0, %xmm0
+; KNL-NEXT:    retl
+  %a1 = load <4 x float>, <4 x float>* %p1
+  %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+
+
 define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_cvttpd2dq:
 ; SSE:       ## BB#0:
@@ -322,6 +360,22 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
 
 
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+; SSE-LABEL: test_x86_sse2_cvttps2dq:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    retl
+;
+; KNL-LABEL: test_x86_sse2_cvttps2dq:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; KNL-NEXT:    retl
+  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
 define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_cvttsd2si:
 ; SSE:       ## BB#0:
diff --git a/test/CodeGen/X86/tail-merge-after-mbp.ll b/test/CodeGen/X86/tail-merge-after-mbp.ll
new file mode 100644
index 000000000000..dc5f3a12bd91
--- /dev/null
+++ b/test/CodeGen/X86/tail-merge-after-mbp.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mtriple=x86_64-linux -o - %s | FileCheck %s
+
+%0 = type { %1, %3* }
+%1 = type { %2* }
+%2 = type { %2*, i8* }
+%3 = type { i32, i32 (i32, i32)* }
+
+
+declare i32 @Up(...) 
+declare i32 @f(i32, i32) 
+
+; check loop block_14 is not merged with block_21
+; check loop block_11 is not merged with block_18, block_25
+define i32 @foo(%0* nocapture readonly, i32, i1 %c, i8* %p1, %2** %p2) {
+; CHECK-LABEL: foo:
+; CHECK:     # %block_11
+; CHECK-NEXT:        movq    (%r14), %rax
+; CHECK-NEXT:        testq   %rax, %rax
+; CHECK-NEXT:        je      
+; CHECK-NEXT:# %block_14
+; CHECK-NEXT:        cmpq    $0, 8(%rax)
+; CHECK-NEXT:        jne    
+; CHECK-NEXT:# %block_18
+; CHECK-NEXT:        movq    (%r14), %rax
+; CHECK-NEXT:        testq   %rax, %rax
+; CHECK-NEXT:        je    
+; CHECK-NEXT:# %block_21
+; CHECK-NEXT:# =>This Inner Loop Header
+; CHECK-NEXT:        cmpq    $0, 8(%rax)
+; CHECK-NEXT:        jne  
+; CHECK-NEXT:# %block_25
+; CHECK-NEXT:#   in Loop
+; CHECK-NEXT:        movq    (%r14), %rax
+; CHECK-NEXT:        testq   %rax, %rax
+; CHECK-NEXT:        jne 
+  br i1 %c, label %block_34, label %block_3
+
+block_3:                                      ; preds = %2
+  br i1 %c, label %block_7, label %block_4
+
+block_4:                                      ; preds = %block_3
+  %a5 = tail call i32 @f(i32 undef, i32 undef)
+  %a6 = icmp eq i32 %a5, 0
+  br i1 %a6, label %block_7, label %block_34
+
+block_7:                                      ; preds = %block_4, %block_3
+  %a8 = icmp eq %2* null, null
+  br i1 %a8, label %block_34, label %block_9
+
+block_9:                                      ; preds = %block_7
+  %a10 = icmp eq i8* %p1, null
+  br i1 %a10, label %block_11, label %block_32
+
+block_11:                                     ; preds = %block_9
+  %a12 = load %2*, %2** %p2, align 8
+  %a13 = icmp eq %2* %a12, null
+  br i1 %a13, label %block_34, label %block_14
+
+block_14:                                     ; preds = %block_11
+  %a15 = getelementptr inbounds %2, %2* %a12, i64 0, i32 1
+  %a16 = load i8*, i8** %a15, align 8
+  %a17 = icmp eq i8* %a16, null
+  br i1 %a17, label %block_18, label %block_32
+
+block_18:                                     ; preds = %block_14
+  %a19 = load %2*, %2** %p2, align 8
+  %a20 = icmp eq %2* %a19, null
+  br i1 %a20, label %block_34, label %block_21
+
+block_21:                                     ; preds = %block_18
+  %a22 = getelementptr inbounds %2, %2* %a19, i64 0, i32 1
+  %a23 = load i8*, i8** %a22, align 8
+  %a24 = icmp eq i8* %a23, null
+  br i1 %a24, label %block_25, label %block_32
+
+block_25:                                     ; preds = %block_28, %block_21
+  %a26 = load %2*, %2** %p2, align 8
+  %a27 = icmp eq %2* %a26, null
+  br i1 %a27, label %block_34, label %block_28
+
+block_28:                                     ; preds = %block_25
+  %a29 = getelementptr inbounds %2, %2* %a26, i64 0, i32 1
+  %a30 = load i8*, i8** %a29, align 8
+  %a31 = icmp eq i8* %a30, null
+  br i1 %a31, label %block_25, label %block_32
+
+block_32:                                     ; preds = %block_28, %block_21, %block_14, %block_9
+  %a33 = tail call i32 (...) @Up()
+  br label %block_34
+
+block_34:                                     ; preds = %block_32, %block_25, %block_18, %block_11, %block_7, %block_4, %2
+  %a35 = phi i32 [ 0, %2 ], [ %a5, %block_4 ], [ 0, %block_7 ], [ 0, %block_11 ], [ 0, %block_32 ], [ 0, %block_18 ], [ 0, %block_25 ]
+  ret i32 %a35
+}
diff --git a/test/CodeGen/X86/twoaddr-lea.ll b/test/CodeGen/X86/twoaddr-lea.ll
index 5779cf33ac84..2944b17c6c16 100644
--- a/test/CodeGen/X86/twoaddr-lea.ll
+++ b/test/CodeGen/X86/twoaddr-lea.ll
@@ -44,3 +44,60 @@ entry:
   %0 = shl i64 %x, 1
   ret i64 %0
 }
+
+@global = external global i32, align 4
+@global2 = external global i64, align 8
+
+; Test that liveness is properly updated and we do not encounter the
+; assert/crash from http://llvm.org/PR28301
+; CHECK-LABEL: ham
+define void @ham() {
+bb:
+  br label %bb1
+
+bb1:
+  %tmp = phi i64 [ %tmp40, %bb9 ], [ 0, %bb ]
+  %tmp2 = phi i32 [ %tmp39, %bb9 ], [ 0, %bb ]
+  %tmp3 = icmp sgt i32 undef, 10
+  br i1 %tmp3, label %bb2, label %bb3
+
+bb2:
+  %tmp6 = load i32, i32* @global, align 4
+  %tmp8 = add nsw i32 %tmp6, %tmp2
+  %tmp9 = sext i32 %tmp8 to i64
+  br label %bb6
+
+bb3:
+; CHECK: subl %e[[REG0:[a-z0-9]+]],
+; CHECK: leaq 4({{%[a-z0-9]+}}), %r[[REG0]]
+  %tmp14 = phi i64 [ %tmp15, %bb5 ], [ 0, %bb1 ]
+  %tmp15 = add nuw i64 %tmp14, 4
+  %tmp16 = trunc i64 %tmp14 to i32
+  %tmp17 = sub i32 %tmp2, %tmp16
+  br label %bb4
+
+bb4:
+  %tmp20 = phi i64 [ %tmp14, %bb3 ], [ %tmp34, %bb5 ]
+  %tmp28 = icmp eq i32 %tmp17, 0
+  br i1 %tmp28, label %bb5, label %bb8
+
+bb5:
+  %tmp34 = add nuw nsw i64 %tmp20, 1
+  %tmp35 = icmp slt i64 %tmp34, %tmp15
+  br i1 %tmp35, label %bb4, label %bb3
+
+bb6:
+  store volatile i64 %tmp, i64* @global2, align 8
+  store volatile i64 %tmp9, i64* @global2, align 8
+  store volatile i32 %tmp6, i32* @global, align 4
+  %tmp45 = icmp slt i32 undef, undef
+  br i1 %tmp45, label %bb6, label %bb9
+
+bb8:
+  unreachable
+
+bb9:
+  %tmp39 = add nuw nsw i32 %tmp2, 4
+  %tmp40 = add nuw i64 %tmp, 4
+  br label %bb1
+}