Diffstat (limited to 'test/CodeGen')
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir  8
-rw-r--r--  test/CodeGen/AArch64/arm64-sincos.ll  50
-rw-r--r--  test/CodeGen/AArch64/fast-isel-sp-adjust.ll  288
-rw-r--r--  test/CodeGen/AArch64/misched-fusion-aes.ll  77
-rw-r--r--  test/CodeGen/AArch64/sincos-expansion.ll  42
-rw-r--r--  test/CodeGen/AArch64/swifterror.ll  27
-rw-r--r--  test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir  2
-rw-r--r--  test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir  2
-rw-r--r--  test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir  22
-rw-r--r--  test/CodeGen/AMDGPU/always-uniform.ll  21
-rw-r--r--  test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll  123
-rw-r--r--  test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll  4
-rw-r--r--  test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir  70
-rw-r--r--  test/CodeGen/AMDGPU/flat-address-space.ll  56
-rw-r--r--  test/CodeGen/AMDGPU/flat_atomics.ll  168
-rw-r--r--  test/CodeGen/AMDGPU/global_smrd_cfg.ll  33
-rw-r--r--  test/CodeGen/AMDGPU/inserted-wait-states.mir  10
-rw-r--r--  test/CodeGen/AMDGPU/limit-coalesce.mir  6
-rw-r--r--  test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir  4
-rw-r--r--  test/CodeGen/AMDGPU/sdwa-scalar-ops.mir  16
-rw-r--r--  test/CodeGen/AMDGPU/waitcnt.mir  22
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll  195
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll  21
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir  75
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-unsupported.ll  8
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vfma.ll  38
-rw-r--r--  test/CodeGen/ARM/debug-info-blocks.ll  6
-rw-r--r--  test/CodeGen/ARM/sincos.ll  67
-rw-r--r--  test/CodeGen/ARM/swifterror.ll  28
-rw-r--r--  test/CodeGen/BPF/rodata_1.ll  52
-rw-r--r--  test/CodeGen/BPF/rodata_2.ll  51
-rw-r--r--  test/CodeGen/BPF/rodata_3.ll  41
-rw-r--r--  test/CodeGen/BPF/rodata_4.ll  43
-rw-r--r--  test/CodeGen/Hexagon/loop-idiom/pmpy-shiftconv-fail.ll  48
-rw-r--r--  test/CodeGen/Hexagon/mulh.ll  27
-rw-r--r--  test/CodeGen/Hexagon/mux-kill.mir  15
-rw-r--r--  test/CodeGen/Hexagon/mux-kill2.mir  17
-rw-r--r--  test/CodeGen/Hexagon/store-imm-stack-object.ll  86
-rw-r--r--  test/CodeGen/Mips/2008-06-05-Carry.ll  13
-rw-r--r--  test/CodeGen/Mips/brundef.ll  26
-rw-r--r--  test/CodeGen/Mips/dsp-patterns.ll  4
-rw-r--r--  test/CodeGen/Mips/llcarry.ll  11
-rw-r--r--  test/CodeGen/Mips/llvm-ir/add.ll  394
-rw-r--r--  test/CodeGen/Mips/llvm-ir/sub.ll  174
-rw-r--r--  test/CodeGen/Mips/longbranch.ll  40
-rw-r--r--  test/CodeGen/Mips/madd-msub.ll  81
-rw-r--r--  test/CodeGen/PowerPC/atomic-2.ll  2
-rw-r--r--  test/CodeGen/PowerPC/atomics-constant.ll  2
-rw-r--r--  test/CodeGen/PowerPC/atomics-regression.ll  20
-rw-r--r--  test/CodeGen/PowerPC/licm-tocReg.ll  110
-rw-r--r--  test/CodeGen/PowerPC/logic-ops-on-compares.ll  73
-rw-r--r--  test/CodeGen/PowerPC/ppc64-P9-mod.ll  263
-rw-r--r--  test/CodeGen/PowerPC/testComparesinesll.ll  125
-rw-r--r--  test/CodeGen/PowerPC/testComparesineull.ll  125
-rw-r--r--  test/CodeGen/PowerPC/testComparesllnesll.ll  125
-rw-r--r--  test/CodeGen/PowerPC/testComparesllneull.ll  125
-rw-r--r--  test/CodeGen/PowerPC/vec_revb.ll  54
-rw-r--r--  test/CodeGen/SystemZ/fp-sincos-01.ll  55
-rw-r--r--  test/CodeGen/X86/2012-01-11-split-cv.ll  3
-rw-r--r--  test/CodeGen/X86/StackColoring.ll  64
-rw-r--r--  test/CodeGen/X86/add-sub-nsw-nuw.ll  3
-rw-r--r--  test/CodeGen/X86/addcarry.ll  24
-rw-r--r--  test/CodeGen/X86/avx-vperm2x128.ll  20
-rw-r--r--  test/CodeGen/X86/bt.ll  209
-rw-r--r--  test/CodeGen/X86/cmov-into-branch.ll  24
-rw-r--r--  test/CodeGen/X86/combine-64bit-vec-binop.ll  23
-rw-r--r--  test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll  45
-rw-r--r--  test/CodeGen/X86/fast-isel-select-sse.ll  26
-rw-r--r--  test/CodeGen/X86/fp-logic-replace.ll  2
-rw-r--r--  test/CodeGen/X86/fp-logic.ll  23
-rw-r--r--  test/CodeGen/X86/fp-select-cmp-and.ll  20
-rw-r--r--  test/CodeGen/X86/immediate_merging64.ll  4
-rw-r--r--  test/CodeGen/X86/lea-opt-with-debug.mir  2
-rw-r--r--  test/CodeGen/X86/loop-search.ll  3
-rw-r--r--  test/CodeGen/X86/mask-negated-bool.ll  8
-rw-r--r--  test/CodeGen/X86/memset-2.ll  6
-rw-r--r--  test/CodeGen/X86/memset-nonzero.ll  12
-rw-r--r--  test/CodeGen/X86/memset64-on-x86-32.ll  3
-rw-r--r--  test/CodeGen/X86/negate-i1.ll  10
-rw-r--r--  test/CodeGen/X86/negate-shift.ll  6
-rw-r--r--  test/CodeGen/X86/negate.ll  8
-rw-r--r--  test/CodeGen/X86/negative-sin.ll  14
-rw-r--r--  test/CodeGen/X86/no-sse2-avg.ll  3
-rw-r--r--  test/CodeGen/X86/not-and-simplify.ll  1
-rw-r--r--  test/CodeGen/X86/pr13577.ll  8
-rw-r--r--  test/CodeGen/X86/pr18014.ll  3
-rw-r--r--  test/CodeGen/X86/pr32368.ll  153
-rw-r--r--  test/CodeGen/X86/rem.ll  8
-rw-r--r--  test/CodeGen/X86/sar_fold64.ll  8
-rw-r--r--  test/CodeGen/X86/select-with-and-or.ll  16
-rw-r--r--  test/CodeGen/X86/sext-setcc-self.ll  8
-rw-r--r--  test/CodeGen/X86/shift-pcmp.ll  4
-rw-r--r--  test/CodeGen/X86/sincos-opt.ll  137
-rw-r--r--  test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll  2
-rw-r--r--  test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll  10
-rw-r--r--  test/CodeGen/X86/stack-folding-int-avx512.ll  93
-rw-r--r--  test/CodeGen/X86/stack-folding-int-avx512vl.ll  74
-rw-r--r--  test/CodeGen/X86/statepoint-live-in.ll  8
-rw-r--r--  test/CodeGen/X86/swifterror.ll  108
-rw-r--r--  test/CodeGen/X86/urem-i8-constant.ll  3
-rw-r--r--  test/CodeGen/X86/urem-power-of-two.ll  7
-rw-r--r--  test/CodeGen/X86/vec3.ll  4
-rw-r--r--  test/CodeGen/X86/vector-compare-combines.ll  4
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v16.ll  18
-rw-r--r--  test/CodeGen/X86/vzero-excess.ll  2
-rw-r--r--  test/CodeGen/X86/x86-interleaved-access.ll  44
106 files changed, 3378 insertions, 1601 deletions
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index c806b4a7060d1..ce913d211ae27 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -53,9 +53,7 @@ body: |
; CHECK: %7(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
%7(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
- ; CHECK: [[OFFSET0:%[0-9]+]](s64) = G_CONSTANT i64 0
- ; CHECK: [[GEP0:%[0-9]+]](p0) = G_GEP %0, [[OFFSET0]](s64)
- ; CHECK: [[LOAD0:%[0-9]+]](s64) = G_LOAD [[GEP0]](p0) :: (load 16 from %ir.addr)
+ ; CHECK: [[LOAD0:%[0-9]+]](s64) = G_LOAD %0(p0) :: (load 16 from %ir.addr)
; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 8
; CHECK: [[GEP1:%[0-9]+]](p0) = G_GEP %0, [[OFFSET1]](s64)
; CHECK: [[LOAD1:%[0-9]+]](s64) = G_LOAD [[GEP1]](p0) :: (load 16 from %ir.addr)
@@ -105,9 +103,7 @@ body: |
; CHECK: G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr)
G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr)
- ; CHECK: [[OFFSET0:%[0-9]+]](s64) = G_CONSTANT i64 0
- ; CHECK: [[GEP0:%[0-9]+]](p0) = G_GEP %0, [[OFFSET0]](s64)
- ; CHECK: G_STORE %5(s64), [[GEP0]](p0) :: (store 16 into %ir.addr)
+ ; CHECK: G_STORE %5(s64), %0(p0) :: (store 16 into %ir.addr)
; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 8
; CHECK: [[GEP1:%[0-9]+]](p0) = G_GEP %0, [[OFFSET1]](s64)
; CHECK: G_STORE %6(s64), [[GEP1]](p0) :: (store 16 into %ir.addr)
diff --git a/test/CodeGen/AArch64/arm64-sincos.ll b/test/CodeGen/AArch64/arm64-sincos.ll
index 06157b2580c47..98876dbe87b03 100644
--- a/test/CodeGen/AArch64/arm64-sincos.ll
+++ b/test/CodeGen/AArch64/arm64-sincos.ll
@@ -1,7 +1,9 @@
; RUN: llc < %s -mtriple=arm64-apple-ios7 | FileCheck %s --check-prefix CHECK-IOS
; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix CHECK-LINUX
-; Combine sin / cos into a single call.
+; Combine sin / cos into a single call unless they may write errno (as
+; captured by the readnone attribute, controlled by the clang -fmath-errno
+; setting).
; rdar://12856873
define float @test1(float %x) nounwind {
@@ -11,11 +13,26 @@ entry:
; CHECK-IOS: fadd s0, s0, s1
; CHECK-LINUX-LABEL: test1:
+; CHECK-LINUX: bl sincosf
+
+ %call = tail call float @sinf(float %x) readnone
+ %call1 = tail call float @cosf(float %x) readnone
+ %add = fadd float %call, %call1
+ ret float %add
+}
+
+define float @test1_errno(float %x) nounwind {
+entry:
+; CHECK-IOS-LABEL: test1_errno:
+; CHECK-IOS: bl _sinf
+; CHECK-IOS: bl _cosf
+
+; CHECK-LINUX-LABEL: test1_errno:
; CHECK-LINUX: bl sinf
; CHECK-LINUX: bl cosf
- %call = tail call float @sinf(float %x) nounwind readnone
- %call1 = tail call float @cosf(float %x) nounwind readnone
+ %call = tail call float @sinf(float %x)
+ %call1 = tail call float @cosf(float %x)
%add = fadd float %call, %call1
ret float %add
}
@@ -27,16 +44,31 @@ entry:
; CHECK-IOS: fadd d0, d0, d1
; CHECK-LINUX-LABEL: test2:
+; CHECK-LINUX: bl sincos
+
+ %call = tail call double @sin(double %x) readnone
+ %call1 = tail call double @cos(double %x) readnone
+ %add = fadd double %call, %call1
+ ret double %add
+}
+
+define double @test2_errno(double %x) nounwind {
+entry:
+; CHECK-IOS-LABEL: test2_errno:
+; CHECK-IOS: bl _sin
+; CHECK-IOS: bl _cos
+
+; CHECK-LINUX-LABEL: test2_errno:
; CHECK-LINUX: bl sin
; CHECK-LINUX: bl cos
- %call = tail call double @sin(double %x) nounwind readnone
- %call1 = tail call double @cos(double %x) nounwind readnone
+ %call = tail call double @sin(double %x)
+ %call1 = tail call double @cos(double %x)
%add = fadd double %call, %call1
ret double %add
}
-declare float @sinf(float) readonly
-declare double @sin(double) readonly
-declare float @cosf(float) readonly
-declare double @cos(double) readonly
+declare float @sinf(float)
+declare double @sin(double)
+declare float @cosf(float)
+declare double @cos(double)
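As a rough source-level illustration of the split the tests above encode (a minimal sketch, not part of this commit, and assuming the usual clang handling of math errno), the readnone attribute on the sinf/cosf calls normally arises from building with -fno-math-errno; with math errno enabled the calls stay unattributed and are not merged into sincosf:

  /* sketch only: how the readnone-vs-plain call split above typically arises */
  #include <math.h>

  float sum_sin_cos(float x) {
    /* clang -O2 -fno-math-errno: sinf/cosf may be marked readnone, so the
       AArch64 backend is free to fold them into a single sincosf call
       (the CHECK-LINUX "bl sincosf" cases above).
       clang -O2 -fmath-errno: the calls may set errno, so they are kept as
       separate sinf and cosf calls (the *_errno cases above). */
    return sinf(x) + cosf(x);
  }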
diff --git a/test/CodeGen/AArch64/fast-isel-sp-adjust.ll b/test/CodeGen/AArch64/fast-isel-sp-adjust.ll
new file mode 100644
index 0000000000000..9201d1be6a9ce
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-sp-adjust.ll
@@ -0,0 +1,288 @@
+; RUN: llc -O0 -mtriple=aarch64-apple-ios -o - %s | FileCheck %s
+; RUN: not llc -O0 -mtriple=aarch64-apple-ios -o /dev/null -fast-isel-abort=3 %s 2> %t
+; RUN: FileCheck %s --check-prefix=CHECK-ERRORS < %t
+
+; The issue here is that FastISel cannot emit an ADDrr where one of the inputs
+; is SP. This only ever crops up with function calls, and then only if the
+; argument is at an offset > 2^12 * size from SP.
+
+; If FastISel ever starts coping with this and emits an "add xD, sp, xM" it's
+; critical to check the encoding as well as the textual assembly. An ADDXrs with
+; SP as an operand will still print with SP, but will actually mean XZR.
+
+; CHECK-ERRORS: LLVM ERROR: FastISel missed call
+
+; CHECK-LABEL: foo:
+; CHECK-DAG: mov x[[SP:[0-9]+]], sp
+; CHECK-DAG: mov [[TMP:w[0-9]+]], #4104
+; CHECK: mov w[[OFFSET:[0-9]+]], [[TMP]]
+; CHECK: strb w0, [x[[SP]], x[[OFFSET]]]
+
+define void @foo(i8 %in) {
+ call void @bar(i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef, ; All regs gone.
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 32
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 64
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 128
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 256
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 512
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 1024
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 2048
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef,
+ i64 undef, i64 undef, i64 undef, i64 undef, ; sp + 4096
+ i64 undef, ; sp + 4104 (i.e. not uimm12 or uimm12 << 12).
+ i8 %in)
+ ret void
+}
+
+declare void @bar(i64, i64, i64, i64,
+ i64, i64, i64, i64, ; All regs gone.
+ i64, i64, i64, i64, ; sp + 32
+ i64, i64, i64, i64, ; sp + 64
+ i64, i64, i64, i64,
+ i64, i64, i64, i64, ; sp + 128
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64, ; sp + 256
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64, ; sp + 512
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64, ; sp + 1024
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64, ; sp + 2048
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64,
+ i64, i64, i64, i64, ; sp + 4096
+ i64,
+ i8)
diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll
index bd7c69c910c0e..8ee4dbcee52b6 100644
--- a/test/CodeGen/AArch64/misched-fusion-aes.ll
+++ b/test/CodeGen/AArch64/misched-fusion-aes.ll
@@ -1,7 +1,9 @@
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-aes,+crypto | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=generic -mattr=+crypto | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1
declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)
@@ -74,22 +76,23 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
ret void
; CHECK-LABEL: aesea:
-; CHECKCORTEX: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
-; CHECKCORTEX: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
-; CHECKCORTEX: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
-; CHECKCORTEX: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
-; CHECKCORTEX: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
-; CHECKCORTEX: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
-; CHECKCORTEX: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
-; CHECKCORTEX: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+; CHECKFUSEALLPAIRS: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKFUSEALLPAIRS: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
+; CHECKFUSEALLPAIRS: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
+; CHECKFUSEALLPAIRS: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKFUSEALLPAIRS: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
+; CHECKFUSEALLPAIRS: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
+; CHECKFUSEALLPAIRS: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
+; CHECKFUSEALLPAIRS: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+; CHECKFUSEALLPAIRS-NOT: aesmc
; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
@@ -175,22 +178,23 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
ret void
; CHECK-LABEL: aesda:
-; CHECKCORTEX: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
-; CHECKCORTEX: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
-; CHECKCORTEX: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
-; CHECKCORTEX: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
-; CHECKCORTEX: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
-; CHECKCORTEX: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
-; CHECKCORTEX: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
-; CHECKCORTEX: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+; CHECKFUSEALLPAIRS: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKFUSEALLPAIRS: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
+; CHECKFUSEALLPAIRS: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
+; CHECKFUSEALLPAIRS: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKFUSEALLPAIRS: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
+; CHECKFUSEALLPAIRS: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
+; CHECKFUSEALLPAIRS: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
+; CHECKFUSEALLPAIRS: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+; CHECKFUSEALLPAIRS-NOT: aesimc
; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
@@ -236,4 +240,5 @@ entry:
; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
; CHECK: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
+; CHECK-NOT: aesmc
}
diff --git a/test/CodeGen/AArch64/sincos-expansion.ll b/test/CodeGen/AArch64/sincos-expansion.ll
index c3a172dfb427e..41ee40378b4fc 100644
--- a/test/CodeGen/AArch64/sincos-expansion.ll
+++ b/test/CodeGen/AArch64/sincos-expansion.ll
@@ -1,8 +1,18 @@
; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
define float @test_sincos_f32(float %f) {
+; CHECK-LABEL: test_sincos_f32:
%sin = call float @sinf(float %f) readnone
%cos = call float @cosf(float %f) readnone
+; CHECK: bl sincosf
+ %val = fadd float %sin, %cos
+ ret float %val
+}
+
+define float @test_sincos_f32_errno(float %f) {
+; CHECK-LABEL: test_sincos_f32_errno:
+ %sin = call float @sinf(float %f)
+ %cos = call float @cosf(float %f)
; CHECK: bl sinf
; CHECK: bl cosf
%val = fadd float %sin, %cos
@@ -10,26 +20,46 @@ define float @test_sincos_f32(float %f) {
}
define double @test_sincos_f64(double %f) {
+; CHECK-LABEL: test_sincos_f64:
%sin = call double @sin(double %f) readnone
%cos = call double @cos(double %f) readnone
%val = fadd double %sin, %cos
+; CHECK: bl sincos
+ ret double %val
+}
+
+define double @test_sincos_f64_errno(double %f) {
+; CHECK-LABEL: test_sincos_f64_errno:
+ %sin = call double @sin(double %f)
+ %cos = call double @cos(double %f)
+ %val = fadd double %sin, %cos
; CHECK: bl sin
; CHECK: bl cos
ret double %val
}
define fp128 @test_sincos_f128(fp128 %f) {
+; CHECK-LABEL: test_sincos_f128:
%sin = call fp128 @sinl(fp128 %f) readnone
%cos = call fp128 @cosl(fp128 %f) readnone
%val = fadd fp128 %sin, %cos
+; CHECK: bl sincosl
+ ret fp128 %val
+}
+
+define fp128 @test_sincos_f128_errno(fp128 %f) {
+; CHECK-LABEL: test_sincos_f128_errno:
+ %sin = call fp128 @sinl(fp128 %f)
+ %cos = call fp128 @cosl(fp128 %f)
+ %val = fadd fp128 %sin, %cos
; CHECK: bl sinl
; CHECK: bl cosl
ret fp128 %val
}
-declare float @sinf(float) readonly
-declare double @sin(double) readonly
-declare fp128 @sinl(fp128) readonly
-declare float @cosf(float) readonly
-declare double @cos(double) readonly
-declare fp128 @cosl(fp128) readonly
+declare float @sinf(float)
+declare double @sin(double)
+declare fp128 @sinl(fp128)
+declare float @cosf(float)
+declare double @cos(double)
+declare fp128 @cosl(fp128)
diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll
index 69bf3510cc5a7..bc28f477c8104 100644
--- a/test/CodeGen/AArch64/swifterror.ll
+++ b/test/CodeGen/AArch64/swifterror.ll
@@ -597,3 +597,30 @@ entry:
tail call void @acallee(i8* null)
ret void
}
+
+declare swiftcc void @foo2(%swift_error** swifterror)
+
+; Make sure we properly assign registers during fast-isel.
+; CHECK-O0-LABEL: testAssign
+; CHECK-O0: mov [[TMP:x.*]], xzr
+; CHECK-O0: mov x21, [[TMP]]
+; CHECK-O0: bl _foo2
+; CHECK-O0: str x21, [s[[STK:.*]]]
+; CHECK-O0: ldr x0, [s[[STK]]]
+
+; CHECK-APPLE-LABEL: testAssign
+; CHECK-APPLE: mov x21, xzr
+; CHECK-APPLE: bl _foo2
+; CHECK-APPLE: mov x0, x21
+
+define swiftcc %swift_error* @testAssign(i8* %error_ref) {
+entry:
+ %error_ptr = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr
+ call swiftcc void @foo2(%swift_error** swifterror %error_ptr)
+ br label %a
+
+a:
+ %error = load %swift_error*, %swift_error** %error_ptr
+ ret %swift_error* %error
+}
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index 2a3d3887ed69c..56a9e7022db9c 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -14,7 +14,7 @@ regBankSelected: true
# GCN: global_addrspace
# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
-# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0
+# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0
body: |
bb.0:
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index 89be3bde94a8d..ea435725bf25d 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -15,7 +15,7 @@ regBankSelected: true
# GCN: global_addrspace
# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
# GCN: [[VAL:%[0-9]+]] = COPY %vgpr2
-# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0
+# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0
body: |
bb.0:
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir
new file mode 100644
index 0000000000000..f10c896a7af66
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir
@@ -0,0 +1,22 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+ define void @test_add() { ret void }
+...
+
+---
+name: test_add
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %vgpr0, %vgpr1
+ ; CHECK-LABEL: name: test_add
+ ; CHECK: %2(s32) = G_ADD %0, %1
+
+ %0(s32) = COPY %vgpr0
+ %1(s32) = COPY %vgpr1
+ %2(s32) = G_ADD %0, %1
+...
diff --git a/test/CodeGen/AMDGPU/always-uniform.ll b/test/CodeGen/AMDGPU/always-uniform.ll
new file mode 100644
index 0000000000000..4ba57fba81bc0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/always-uniform.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple amdgcn-amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.readfirstlane(i32)
+
+; GCN-LABEL: readfirstlane_uniform
+; GCN: s_load_dwordx2 s{{\[}}[[IN_ADDR:[0-9]+]]:1{{\]}}, s[4:5], 0x0
+; GCN: v_readfirstlane_b32 s[[SCALAR:[0-9]+]], v0
+; GCN: s_add_u32 s[[LOAD_ADDR:[0-9]+]], s[[IN_ADDR]], s[[SCALAR]]
+; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[LOAD_ADDR]]
+
+define amdgpu_kernel void @readfirstlane_uniform(float addrspace(1)* noalias nocapture readonly, float addrspace(1)* noalias nocapture readonly) {
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %scalar = tail call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
+ %idx = zext i32 %scalar to i64
+ %gep0 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idx
+ %val = load float, float addrspace(1)* %gep0, align 4
+ %gep1 = getelementptr inbounds float, float addrspace(1)* %1, i64 10
+ store float %val, float addrspace(1)* %gep1, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index cbdcf6aeaf429..5dec3e35ab3d0 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -1,12 +1,19 @@
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
-; OPT: getelementptr i32, i32 addrspace(4)* %in
-; OPT: br i1
-; OPT-NOT: ptrtoint
+; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in
+; OPT-CIVI: br i1
+; OPT-CIVI-NOT: ptrtoint
+
+; OPT-GFX9: br
+; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(4)* %0, i64 28
+; OPT-GFX9: %1 = bitcast i8 addrspace(4)* %sunkaddr to i32 addrspace(4)*
+; OPT-GFX9: load i32, i32 addrspace(4)* %1
; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
; GCN: flat_load_dword
@@ -96,3 +103,105 @@ endif:
done:
ret void
}
+
+; OPT-LABEL: @test_sink_flat_small_max_flat_offset(
+; OPT-CIVI: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
+; OPT-CIVI: br
+; OPT-CIVI-NOT: getelementptr
+; OPT-CIVI: load i8, i8 addrspace(4)* %in.gep
+
+; OPT-GFX9: br
+; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(4)* %in, i64 4095
+; OPT-GFX9: load i8, i8 addrspace(4)* %sunkaddr
+
+; GCN-LABEL: {{^}}test_sink_flat_small_max_flat_offset:
+; GFX9: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
+; CIVI: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @test_sink_flat_small_max_flat_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in) #1 {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024
+ %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(4)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(4)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_flat_small_max_plus_1_flat_offset(
+; OPT: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4096
+; OPT: br
+; OPT-NOT: getelementptr
+; OPT: load i8, i8 addrspace(4)* %in.gep
+
+; GCN-LABEL: {{^}}test_sink_flat_small_max_plus_1_flat_offset:
+; GCN: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @test_sink_flat_small_max_plus_1_flat_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in) #1 {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 99999
+ %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4096
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(4)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(4)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_no_sink_flat_reg_offset(
+; OPT: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 %reg
+; OPT: br
+
+; OPT-NOT: getelementptr
+; OPT: load i8, i8 addrspace(4)* %in.gep
+
+; GCN-LABEL: {{^}}test_no_sink_flat_reg_offset:
+; GCN: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @test_no_sink_flat_reg_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in, i64 %reg) #1 {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024
+ %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 %reg
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(4)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(4)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind argmemonly }
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
index 801029be8cb9f..0796c24b3317c 100644
--- a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
@@ -12,7 +12,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
; CHECK: DebugProps:
; CHECK: DebuggerABIVersion: [ 1, 0 ]
; CHECK: ReservedNumVGPRs: 4
-; CHECK: ReservedFirstVGPR: 11
+; GFX700: ReservedFirstVGPR: 11
+; GFX800: ReservedFirstVGPR: 11
+; GFX9: ReservedFirstVGPR: 14
; CHECK: PrivateSegmentBufferSGPR: 0
; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index bc992ed77ffdb..62b47beb12518 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -219,19 +219,19 @@ body: |
%34 = V_MOV_B32_e32 63, implicit %exec
%27 = V_AND_B32_e64 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %27, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_AND_B32_e64 %24, %26, implicit %exec
- FLAT_STORE_DWORD %37, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%29 = V_AND_B32_e32 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %29, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%30 = V_AND_B32_e64 %26, %26, implicit %exec
- FLAT_STORE_DWORD %37, %30, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%31 = V_AND_B32_e64 %34, %34, implicit %exec
- FLAT_STORE_DWORD %37, %31, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
@@ -407,34 +407,34 @@ body: |
%27 = S_MOV_B32 -4
%11 = V_LSHLREV_B32_e64 12, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_LSHL_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_LSHL_B32_e64 12, %7, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_LSHL_B32_e64 12, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_LSHL_B32_e64 %6, 12, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_LSHL_B32_e64 %6, 32, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_LSHL_B32_e32 %6, %6, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_LSHLREV_B32_e32 11, %24, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_LSHL_B32_e32 %27, %6, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
@@ -615,34 +615,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_ASHRREV_I32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_ASHR_I32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_ASHR_I32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_ASHR_I32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_ASHR_I32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_ASHR_I32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_ASHR_I32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_ASHRREV_I32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_ASHR_I32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
@@ -824,34 +824,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_LSHR_B32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_LSHR_B32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_LSHR_B32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_LSHR_B32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_LSHR_B32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_LSHR_B32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_LSHR_B32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
index c867e4fca2295..e486b9c71a54d 100644
--- a/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -1,6 +1,7 @@
-; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA,GFX9 %s
; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.
@@ -172,6 +173,55 @@ define amdgpu_kernel void @flat_scratch_multidword_store() {
ret void
}
+; CHECK-LABEL: {{^}}store_flat_i8_max_offset:
+; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
+; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
+define amdgpu_kernel void @store_flat_i8_max_offset(i8 addrspace(4)* %fptr, i8 %x) #0 {
+ %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4095
+ store volatile i8 %x, i8 addrspace(4)* %fptr.offset
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1:
+; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8 addrspace(4)* %fptr, i8 %x) #0 {
+ %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4096
+ store volatile i8 %x, i8 addrspace(4)* %fptr.offset
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
+; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @store_flat_i8_neg_offset(i8 addrspace(4)* %fptr, i8 %x) #0 {
+ %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 -2
+ store volatile i8 %x, i8 addrspace(4)* %fptr.offset
+ ret void
+}
+
+; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
+; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
+define amdgpu_kernel void @load_flat_i8_max_offset(i8 addrspace(4)* %fptr) #0 {
+ %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4095
+ %val = load volatile i8, i8 addrspace(4)* %fptr.offset
+ ret void
+}
+
+; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
+; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8 addrspace(4)* %fptr) #0 {
+ %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4096
+ %val = load volatile i8, i8 addrspace(4)* %fptr.offset
+ ret void
+}
+
+; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
+; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define amdgpu_kernel void @load_flat_i8_neg_offset(i8 addrspace(4)* %fptr) #0 {
+ %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 -2
+ %val = load volatile i8, i8 addrspace(4)* %fptr.offset
+ ret void
+}
+
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
-attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/flat_atomics.ll b/test/CodeGen/AMDGPU/flat_atomics.ll
index cc95d80570e09..8e153181decb7 100644
--- a/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}atomic_add_i32_offset:
-; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; CIVI: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GFX9: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -10,8 +12,28 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}atomic_add_i32_max_offset:
+; CIVI: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GFX9: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:4092{{$}}
+define amdgpu_kernel void @atomic_add_i32_max_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 1023
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_max_offset_p1:
+; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @atomic_add_i32_max_offset_p1(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
; GCN-LABEL: {{^}}atomic_add_i32_ret_offset:
-; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -22,7 +44,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
-; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -32,7 +55,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
-; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -82,7 +106,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_and_i32_offset:
-; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -91,7 +116,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_and_i32_ret_offset:
-; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -102,7 +128,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
-; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -112,7 +139,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
-; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -162,7 +190,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_sub_i32_offset:
-; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -171,7 +200,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset:
-; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -182,7 +212,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
-; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -192,7 +223,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
-; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -242,7 +274,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_max_i32_offset:
-; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -251,7 +284,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_max_i32_ret_offset:
-; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -262,7 +296,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
-; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -272,7 +307,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
-; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -322,7 +358,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umax_i32_offset:
-; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -331,7 +368,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset:
-; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -342,7 +380,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
-; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -352,7 +391,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
-; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -402,7 +442,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_min_i32_offset:
-; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -411,7 +452,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_min_i32_ret_offset:
-; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -422,7 +464,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
-; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -432,7 +475,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
-; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -482,7 +526,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umin_i32_offset:
-; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -491,7 +536,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset:
-; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -502,7 +548,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset:
-; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -512,7 +559,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
-; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -562,7 +610,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_or_i32_offset:
-; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+; CIVI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -571,7 +620,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_or_i32_ret_offset:
-; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -582,7 +632,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset:
-; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+; CIVI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -592,7 +643,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
-; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -642,7 +694,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
-; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -651,7 +704,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
-; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -662,7 +716,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
-; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -672,7 +727,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
-; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -724,7 +780,8 @@ entry:
; CMP_SWAP
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
-; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; CIVI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX9: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -733,7 +790,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
-; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; CIVI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX9: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
entry:
@@ -745,7 +803,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
-; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; CIVI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GFX9: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -755,7 +814,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
-; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; CIVI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GFX9: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
entry:
@@ -808,7 +868,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xor_i32_offset:
-; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; CIVI: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GFX9: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -817,7 +878,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset:
-; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
entry:
@@ -828,7 +890,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset:
-; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; CIVI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
@@ -838,7 +901,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
-; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; CIVI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
entry:
@@ -888,7 +952,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_load_i32_offset:
-; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; CIVI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX9: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
@@ -909,7 +974,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset:
-; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; CIVI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GFX9: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
entry:
@@ -932,7 +998,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_store_i32_offset:
-; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+; GFX9: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16 glc{{$}}
define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
@@ -949,7 +1016,8 @@ entry:
}
; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset:
-; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+; GFX9: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16 glc{{$}}
define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
diff --git a/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/test/CodeGen/AMDGPU/global_smrd_cfg.ll
index a6a04151caa98..be6e3fd05ae73 100644
--- a/test/CodeGen/AMDGPU/global_smrd_cfg.ll
+++ b/test/CodeGen/AMDGPU/global_smrd_cfg.ll
@@ -72,6 +72,39 @@ bb22: ; preds = %bb20, %bb11
br i1 %tmp31, label %bb7, label %bb11
}
+; One more test to ensure that an aliasing store after the load is considered
+; clobbering if the load's parent block is the same as a loop header block.
+
+; CHECK-LABEL: %bb1
+
+; The load from %arg has an aliasing store after it, but that store is still
+; considered clobbering because of the loop.
+
+; CHECK: flat_load_dword
+
+define amdgpu_kernel void @cfg_selfloop(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
+bb:
+ br label %bb1
+
+bb2:
+ ret void
+
+bb1:
+ %tmp13 = phi i32 [ %tmp25, %bb1 ], [ 0, %bb ]
+ %tmp14 = srem i32 %tmp13, %arg2
+ %tmp15 = sext i32 %tmp14 to i64
+ %tmp16 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp15
+ %tmp17 = load i32, i32 addrspace(1)* %tmp16, align 4, !tbaa !0
+ %tmp19 = sext i32 %tmp13 to i64
+ %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp19
+ store i32 %tmp17, i32 addrspace(1)* %tmp21, align 4, !tbaa !0
+ %tmp25 = add nuw nsw i32 %tmp13, 1
+ %tmp31 = icmp eq i32 %tmp25, 100
+ br i1 %tmp31, label %bb2, label %bb1
+}
+
+
attributes #0 = { "target-cpu"="fiji" }
!0 = !{!1, !1, i64 0}
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index ff9fcd1c693f7..c6fe6debd225a 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -246,15 +246,15 @@ body: |
S_BRANCH %bb.1
bb.1:
- FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr
+ FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr
+ FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/limit-coalesce.mir b/test/CodeGen/AMDGPU/limit-coalesce.mir
index 7d6d8a5891cd8..d6b3d7b14cd21 100644
--- a/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -57,15 +57,15 @@ body: |
%4.sub1 = COPY %3.sub0
undef %5.sub0 = COPY %4.sub1
%5.sub1 = COPY %4.sub0
- FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr
%6 = IMPLICIT_DEF
undef %7.sub0_sub1 = COPY %6
%7.sub2 = COPY %3.sub0
- FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr
%8 = IMPLICIT_DEF
undef %9.sub0_sub1_sub2 = COPY %8
%9.sub3 = COPY %3.sub0
- FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr
...
diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir
index 1a0d68d81f976..31024277871d8 100644
--- a/test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir
+++ b/test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir
@@ -58,12 +58,12 @@ body: |
bb.3:
%1 = COPY killed %17
- FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, 0, implicit %exec, implicit %flat_scr
%14 = COPY %1.sub1
%16 = COPY killed %1.sub0
undef %15.sub0 = COPY killed %16
%15.sub1 = COPY killed %14
- FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
...
diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
index cd0d410368c7d..ba937c927c706 100644
--- a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
+++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
@@ -214,26 +214,26 @@ body: |
%15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
%16 = REG_SEQUENCE %14, 1, %15, 2
%18 = COPY %16
- %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
+ %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
%60 = V_BFE_U32 %17, 8, 8, implicit %exec
%61 = V_LSHLREV_B32_e32 2, killed %60, implicit %exec
%70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
%66 = COPY %13
%65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%67 = REG_SEQUENCE %70, 1, killed %65, 2
- FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
+ FLAT_STORE_DWORD %67, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
%37 = S_ADD_U32 %14, 4, implicit-def %scc
%38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
%71 = COPY killed %37
%72 = COPY killed %38
%41 = REG_SEQUENCE killed %71, 1, killed %72, 2
- %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
+ %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
%73 = V_BFE_U32 %40, 8, 8, implicit %exec
%74 = V_LSHLREV_B32_e32 2, killed %73, implicit %exec
%83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
%78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%80 = REG_SEQUENCE %83, 1, killed %78, 2
- FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
+ FLAT_STORE_DWORD %80, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
%55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
%56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
%57 = REG_SEQUENCE %55, 1, killed %56, 2
@@ -377,26 +377,26 @@ body: |
%15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
%16 = REG_SEQUENCE %14, 1, %15, 2
%18 = COPY %16
- %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
+ %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
%60 = V_BFE_U32 %17, 8, 8, implicit %exec
%61 = V_LSHLREV_B32_e32 %84, killed %60, implicit %exec
%70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
%66 = COPY %13
%65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%67 = REG_SEQUENCE %70, 1, killed %65, 2
- FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
+ FLAT_STORE_DWORD %67, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
%37 = S_ADD_U32 %14, 4, implicit-def %scc
%38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
%71 = COPY killed %37
%72 = COPY killed %38
%41 = REG_SEQUENCE killed %71, 1, killed %72, 2
- %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
+ %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
%73 = V_BFE_U32 %40, 8, 8, implicit %exec
%74 = V_LSHLREV_B32_e32 %84, killed %73, implicit %exec
%83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
%78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
%80 = REG_SEQUENCE %83, 1, killed %78, 2
- FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
+ FLAT_STORE_DWORD %80, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
%55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
%56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
%57 = REG_SEQUENCE %55, 1, killed %56, 2
diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir
index f754415dccb4b..38662e83b359d 100644
--- a/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/test/CodeGen/AMDGPU/waitcnt.mir
@@ -51,21 +51,21 @@ name: flat_zero_waitcnt
body: |
bb.0:
successors: %bb.1
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4)
- %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4)
+ %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_BRANCH %bb.1
bb.1:
successors: %bb.2
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr
- %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_BRANCH %bb.2
bb.2:
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4)
- %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16)
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4)
+ %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16)
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_ENDPGM
...
@@ -86,11 +86,11 @@ name: single_fallthrough_successor_no_end_block_wait
body: |
bb.0:
successors: %bb.1
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
bb.1:
%vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
- FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
...
---
@@ -114,15 +114,15 @@ name: single_branch_successor_not_next_block
body: |
bb.0:
successors: %bb.2
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
S_BRANCH %bb.2
bb.1:
- FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
bb.2:
%vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
- FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index 05902c22fb98a..6663a9210b870 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -621,28 +621,18 @@ define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) {
; CHECK: liveins: %r0, %r1
; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
-; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF
-; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0
-; CHECK: [[ARG_ARR2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32
-; CHECK: [[ARG_ARR:%[0-9]+]](s64) = COPY [[ARG_ARR2]]
+; CHECK: [[ARG_ARR:%[0-9]+]](s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 32
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG_ARR]](s64)
; CHECK: %r0 = COPY [[R0]]
; CHECK: %r1 = COPY [[R1]]
; CHECK: BLX @tiny_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0, implicit-def %r1
; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2
-; CHECK: [[RES_ARR0:%[0-9]+]](s96) = IMPLICIT_DEF
-; CHECK: [[RES_ARR1:%[0-9]+]](s96) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0
-; CHECK: [[RES_ARR2:%[0-9]+]](s96) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32
-; CHECK: [[RES_ARR3:%[0-9]+]](s96) = G_INSERT [[RES_ARR2]], [[R2]](s32), 64
-; CHECK: [[RES_ARR:%[0-9]+]](s96) = COPY [[RES_ARR3]]
+; CHECK: [[RES_ARR:%[0-9]+]](s96) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32), [[R2]](s32)
; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 32
-; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 64
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32), [[R2:%[0-9]+]](s32) = G_UNMERGE_VALUES [[RES_ARR]](s96)
; FIXME: This doesn't seem correct with regard to the AAPCS docs (which say
; that composite types larger than 4 bytes should be passed through memory),
; but it's what DAGISel does. We should fix it in the common code for both.
@@ -664,19 +654,11 @@ define arm_aapcscc void @test_multiple_int_arrays([2 x i32] %arr0, [2 x i32] %ar
; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2
; CHECK: [[R3:%[0-9]+]](s32) = COPY %r3
-; CHECK: [[ARG_ARR0_0:%[0-9]+]](s64) = IMPLICIT_DEF
-; CHECK: [[ARG_ARR0_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_0]], [[R0]](s32), 0
-; CHECK: [[ARG_ARR0_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_1]], [[R1]](s32), 32
-; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = COPY [[ARG_ARR0_2]]
-; CHECK: [[ARG_ARR1_0:%[0-9]+]](s64) = IMPLICIT_DEF
-; CHECK: [[ARG_ARR1_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_0]], [[R2]](s32), 0
-; CHECK: [[ARG_ARR1_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_1]], [[R3]](s32), 32
-; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = COPY [[ARG_ARR1_2]]
+; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
+; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = G_MERGE_VALUES [[R2]](s32), [[R3]](s32)
; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 32
-; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 0
-; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 32
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG_ARR0]](s64)
+; CHECK: [[R2:%[0-9]+]](s32), [[R3:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG_ARR1]](s64)
; CHECK: %r0 = COPY [[R0]]
; CHECK: %r1 = COPY [[R1]]
; CHECK: %r2 = COPY [[R2]]
@@ -707,21 +689,9 @@ define arm_aapcscc void @test_large_int_arrays([20 x i32] %arr) {
; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]]
; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]]
; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]]
-; CHECK: [[ARG_ARR0:%[0-9]+]](s640) = IMPLICIT_DEF
-; CHECK: [[ARG_ARR1:%[0-9]+]](s640) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0
-; CHECK: [[ARG_ARR2:%[0-9]+]](s640) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32
-; CHECK: [[ARG_ARR3:%[0-9]+]](s640) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64
-; CHECK: [[ARG_ARR4:%[0-9]+]](s640) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96
-; CHECK: [[ARG_ARR5:%[0-9]+]](s640) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128
-; CHECK: [[ARG_ARR6:%[0-9]+]](s640) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 608
-; CHECK: [[ARG_ARR:%[0-9]+]](s640) = COPY [[ARG_ARR6]]
+; CHECK: [[ARG_ARR:%[0-9]+]](s640) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32), [[R2]](s32), [[R3]](s32), [[FIRST_STACK_ELEMENT]](s32), {{.*}}, [[LAST_STACK_ELEMENT]](s32)
; CHECK: ADJCALLSTACKDOWN 64, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 32
-; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 64
-; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 96
-; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 128
-; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 608
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32), [[R2:%[0-9]+]](s32), [[R3:%[0-9]+]](s32), [[FIRST_STACK_ELEMENT:%[0-9]+]](s32), {{.*}}, [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG_ARR]](s640)
; CHECK: %r0 = COPY [[R0]]
; CHECK: %r1 = COPY [[R1]]
; CHECK: %r2 = COPY [[R2]]
@@ -761,15 +731,9 @@ define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) {
; BIG: [[ARR1:%[0-9]+]](s64) = G_MERGE_VALUES [[ARR1_1]](s32), [[ARR1_0]](s32)
; CHECK: [[ARR2_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[ARR2_ID]]
; CHECK: [[ARR2:%[0-9]+]](s64) = G_LOAD [[ARR2_FI]]{{.*}}load 8 from %fixed-stack.[[ARR2_ID]]
-; CHECK: [[ARR_MERGED_0:%[0-9]+]](s192) = IMPLICIT_DEF
-; CHECK: [[ARR_MERGED_1:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_0]], [[ARR0]](s64), 0
-; CHECK: [[ARR_MERGED_2:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_1]], [[ARR1]](s64), 64
-; CHECK: [[ARR_MERGED_3:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_2]], [[ARR2]](s64), 128
-; CHECK: [[ARR_MERGED:%[0-9]+]](s192) = COPY [[ARR_MERGED_3]]
+; CHECK: [[ARR_MERGED:%[0-9]+]](s192) = G_MERGE_VALUES [[ARR0]](s64), [[ARR1]](s64), [[ARR2]](s64)
; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[ARR0:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 0
-; CHECK: [[ARR1:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 64
-; CHECK: [[ARR2:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 128
+; CHECK: [[ARR0:%[0-9]+]](s64), [[ARR1:%[0-9]+]](s64), [[ARR2:%[0-9]+]](s64) = G_UNMERGE_VALUES [[ARR_MERGED]](s192)
; CHECK: [[ARR0_0:%[0-9]+]](s32), [[ARR0_1:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARR0]](s64)
; LITTLE: %r0 = COPY [[ARR0_0]](s32)
; LITTLE: %r1 = COPY [[ARR0_1]](s32)
@@ -787,13 +751,9 @@ define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) {
; CHECK: BLX @fp_arrays_aapcs_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
-; CHECK: [[R_MERGED_0:%[0-9]+]](s64) = IMPLICIT_DEF
-; CHECK: [[R_MERGED_1:%[0-9]+]](s64) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0
-; CHECK: [[R_MERGED_2:%[0-9]+]](s64) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32
-; CHECK: [[R_MERGED:%[0-9]+]](s64) = COPY [[R_MERGED_2]]
+; CHECK: [[R_MERGED:%[0-9]+]](s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
; CHECK: ADJCALLSTACKUP 8, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 32
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32) = G_UNMERGE_VALUES [[R_MERGED]](s64)
; CHECK: %r0 = COPY [[R0]]
; CHECK: %r1 = COPY [[R1]]
; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
@@ -826,33 +786,13 @@ define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3
; CHECK: [[Z2:%[0-9]+]](s64) = G_LOAD [[Z2_FI]]{{.*}}load 8
; CHECK: [[Z3_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z3_ID]]
; CHECK: [[Z3:%[0-9]+]](s64) = G_LOAD [[Z3_FI]]{{.*}}load 8
-; CHECK: [[X_ARR_0:%[0-9]+]](s192) = IMPLICIT_DEF
-; CHECK: [[X_ARR_1:%[0-9]+]](s192) = G_INSERT [[X_ARR_0]], [[X0]](s64), 0
-; CHECK: [[X_ARR_2:%[0-9]+]](s192) = G_INSERT [[X_ARR_1]], [[X1]](s64), 64
-; CHECK: [[X_ARR_3:%[0-9]+]](s192) = G_INSERT [[X_ARR_2]], [[X2]](s64), 128
-; CHECK: [[X_ARR:%[0-9]+]](s192) = COPY [[X_ARR_3]](s192)
-; CHECK: [[Y_ARR_0:%[0-9]+]](s96) = IMPLICIT_DEF
-; CHECK: [[Y_ARR_1:%[0-9]+]](s96) = G_INSERT [[Y_ARR_0]], [[Y0]](s32), 0
-; CHECK: [[Y_ARR_2:%[0-9]+]](s96) = G_INSERT [[Y_ARR_1]], [[Y1]](s32), 32
-; CHECK: [[Y_ARR_3:%[0-9]+]](s96) = G_INSERT [[Y_ARR_2]], [[Y2]](s32), 64
-; CHECK: [[Y_ARR:%[0-9]+]](s96) = COPY [[Y_ARR_3]](s96)
-; CHECK: [[Z_ARR_0:%[0-9]+]](s256) = IMPLICIT_DEF
-; CHECK: [[Z_ARR_1:%[0-9]+]](s256) = G_INSERT [[Z_ARR_0]], [[Z0]](s64), 0
-; CHECK: [[Z_ARR_2:%[0-9]+]](s256) = G_INSERT [[Z_ARR_1]], [[Z1]](s64), 64
-; CHECK: [[Z_ARR_3:%[0-9]+]](s256) = G_INSERT [[Z_ARR_2]], [[Z2]](s64), 128
-; CHECK: [[Z_ARR_4:%[0-9]+]](s256) = G_INSERT [[Z_ARR_3]], [[Z3]](s64), 192
-; CHECK: [[Z_ARR:%[0-9]+]](s256) = COPY [[Z_ARR_4]](s256)
+; CHECK: [[X_ARR:%[0-9]+]](s192) = G_MERGE_VALUES [[X0]](s64), [[X1]](s64), [[X2]](s64)
+; CHECK: [[Y_ARR:%[0-9]+]](s96) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32), [[Y2]](s32)
+; CHECK: [[Z_ARR:%[0-9]+]](s256) = G_MERGE_VALUES [[Z0]](s64), [[Z1]](s64), [[Z2]](s64), [[Z3]](s64)
; CHECK: ADJCALLSTACKDOWN 32, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[X0:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 0
-; CHECK: [[X1:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 64
-; CHECK: [[X2:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 128
-; CHECK: [[Y0:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 0
-; CHECK: [[Y1:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 32
-; CHECK: [[Y2:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 64
-; CHECK: [[Z0:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 0
-; CHECK: [[Z1:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 64
-; CHECK: [[Z2:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 128
-; CHECK: [[Z3:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 192
+; CHECK: [[X0:%[0-9]+]](s64), [[X1:%[0-9]+]](s64), [[X2:%[0-9]+]](s64) = G_UNMERGE_VALUES [[X_ARR]](s192)
+; CHECK: [[Y0:%[0-9]+]](s32), [[Y1:%[0-9]+]](s32), [[Y2:%[0-9]+]](s32) = G_UNMERGE_VALUES [[Y_ARR]](s96)
+; CHECK: [[Z0:%[0-9]+]](s64), [[Z1:%[0-9]+]](s64), [[Z2:%[0-9]+]](s64), [[Z3:%[0-9]+]](s64) = G_UNMERGE_VALUES [[Z_ARR]](s256)
; CHECK: %d0 = COPY [[X0]](s64)
; CHECK: %d1 = COPY [[X1]](s64)
; CHECK: %d2 = COPY [[X2]](s64)
@@ -880,17 +820,9 @@ define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3
; CHECK: [[R1:%[0-9]+]](s32) = COPY %s1
; CHECK: [[R2:%[0-9]+]](s32) = COPY %s2
; CHECK: [[R3:%[0-9]+]](s32) = COPY %s3
-; CHECK: [[R_MERGED_0:%[0-9]+]](s128) = IMPLICIT_DEF
-; CHECK: [[R_MERGED_1:%[0-9]+]](s128) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0
-; CHECK: [[R_MERGED_2:%[0-9]+]](s128) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32
-; CHECK: [[R_MERGED_3:%[0-9]+]](s128) = G_INSERT [[R_MERGED_2]], [[R2]](s32), 64
-; CHECK: [[R_MERGED_4:%[0-9]+]](s128) = G_INSERT [[R_MERGED_3]], [[R3]](s32), 96
-; CHECK: [[R_MERGED:%[0-9]+]](s128) = COPY [[R_MERGED_4]]
+; CHECK: [[R_MERGED:%[0-9]+]](s128) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32), [[R2]](s32), [[R3]](s32)
; CHECK: ADJCALLSTACKUP 32, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 32
-; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 64
-; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 96
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32), [[R2:%[0-9]+]](s32), [[R3:%[0-9]+]](s32) = G_UNMERGE_VALUES [[R_MERGED]](s128)
; CHECK: %s0 = COPY [[R0]]
; CHECK: %s1 = COPY [[R1]]
; CHECK: %s2 = COPY [[R2]]
@@ -919,21 +851,9 @@ define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) {
; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]]
; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]]
; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]]
-; CHECK: [[ARG_ARR0:%[0-9]+]](s768) = IMPLICIT_DEF
-; CHECK: [[ARG_ARR1:%[0-9]+]](s768) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0
-; CHECK: [[ARG_ARR2:%[0-9]+]](s768) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32
-; CHECK: [[ARG_ARR3:%[0-9]+]](s768) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64
-; CHECK: [[ARG_ARR4:%[0-9]+]](s768) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96
-; CHECK: [[ARG_ARR5:%[0-9]+]](s768) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128
-; CHECK: [[ARG_ARR6:%[0-9]+]](s768) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 736
-; CHECK: [[ARG_ARR:%[0-9]+]](s768) = COPY [[ARG_ARR6]]
+; CHECK: [[ARG_ARR:%[0-9]+]](s768) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32), [[R2]](s32), [[R3]](s32), [[FIRST_STACK_ELEMENT]](s32), {{.*}}, [[LAST_STACK_ELEMENT]](s32)
; CHECK: ADJCALLSTACKDOWN 80, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 32
-; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 64
-; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 96
-; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 128
-; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 736
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32), [[R2:%[0-9]+]](s32), [[R3:%[0-9]+]](s32), [[FIRST_STACK_ELEMENT:%[0-9]+]](s32), {{.*}}, [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG_ARR]](s768)
; CHECK: %r0 = COPY [[R0]]
; CHECK: %r1 = COPY [[R1]]
; CHECK: %r2 = COPY [[R2]]
@@ -951,13 +871,9 @@ define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) {
; CHECK: BLX @tough_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
-; CHECK: [[RES_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF
-; CHECK: [[RES_ARR1:%[0-9]+]](s64) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0
-; CHECK: [[RES_ARR2:%[0-9]+]](s64) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32
-; CHECK: [[RES_ARR:%[0-9]+]](s64) = COPY [[RES_ARR2]]
+; CHECK: [[RES_ARR:%[0-9]+]](s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
; CHECK: ADJCALLSTACKUP 80, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 32
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32) = G_UNMERGE_VALUES [[RES_ARR]](s64)
; CHECK: %r0 = COPY [[R0]]
; CHECK: %r1 = COPY [[R1]]
; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
@@ -966,65 +882,28 @@ entry:
ret [2 x i32*] %r
}
-declare arm_aapcscc {i32, i32} @structs_target({i32, i32}, {i32*, float, i32, double})
+declare arm_aapcscc {i32, i32} @structs_target({i32, i32})
-define arm_aapcscc {i32, i32} @test_structs({i32, i32} %x, {i32*, float, i32, double} %y) {
+define arm_aapcscc {i32, i32} @test_structs({i32, i32} %x) {
; CHECK-LABEL: test_structs
-; CHECK: fixedStack:
-; CHECK-DAG: id: [[Y2_ID:[0-9]+]], type: default, offset: 0, size: 4,
-; CHECK-DAG: id: [[Y3_ID:[0-9]+]], type: default, offset: 8, size: 8,
-; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK: liveins: %r0, %r1
; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
-; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
-; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
-; CHECK: [[Y2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Y2_ID]]
-; CHECK: [[Y2:%[0-9]+]](s32) = G_LOAD [[Y2_ADDR]](p0){{.*}}load 4
-; CHECK: [[Y3_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Y3_ID]]
-; CHECK: [[Y3:%[0-9]+]](s64) = G_LOAD [[Y3_ADDR]](p0){{.*}}load 8
-; CHECK: [[X_0:%[0-9]+]](s64) = IMPLICIT_DEF
-; CHECK: [[X_1:%[0-9]+]](s64) = G_INSERT [[X_0]], [[X0]](s32), 0
-; CHECK: [[X_2:%[0-9]+]](s64) = G_INSERT [[X_1]], [[X1]](s32), 32
-; CHECK: [[X:%[0-9]+]](s64) = COPY [[X_2]]
-; CHECK: [[Y_0:%[0-9]+]](s192) = IMPLICIT_DEF
-; CHECK: [[Y_1:%[0-9]+]](s192) = G_INSERT [[Y_0]], [[Y0]](s32), 0
-; CHECK: [[Y_2:%[0-9]+]](s192) = G_INSERT [[Y_1]], [[Y1]](s32), 32
-; CHECK: [[Y_3:%[0-9]+]](s192) = G_INSERT [[Y_2]], [[Y2]](s32), 64
-; CHECK: [[Y_4:%[0-9]+]](s192) = G_INSERT [[Y_3]], [[Y3]](s64), 128
-; CHECK: [[Y:%[0-9]+]](s192) = COPY [[Y_4]]
-; CHECK: ADJCALLSTACKDOWN 16, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[X0:%[0-9]+]](s32) = G_EXTRACT [[X]](s64), 0
-; CHECK: [[X1:%[0-9]+]](s32) = G_EXTRACT [[X]](s64), 32
-; CHECK: [[Y0:%[0-9]+]](s32) = G_EXTRACT [[Y]](s192), 0
-; CHECK: [[Y1:%[0-9]+]](s32) = G_EXTRACT [[Y]](s192), 32
-; CHECK: [[Y2:%[0-9]+]](s32) = G_EXTRACT [[Y]](s192), 64
-; CHECK: [[Y3:%[0-9]+]](s64) = G_EXTRACT [[Y]](s192), 128
+; CHECK: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32)
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[X0:%[0-9]+]](s32), [[X1:%[0-9]+]](s32) = G_UNMERGE_VALUES [[X]](s64)
; CHECK-DAG: %r0 = COPY [[X0]](s32)
; CHECK-DAG: %r1 = COPY [[X1]](s32)
-; CHECK-DAG: %r2 = COPY [[Y0]](s32)
-; CHECK-DAG: %r3 = COPY [[Y1]](s32)
-; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
-; CHECK: [[Y2_OFF:%[0-9]+]](s32) = G_CONSTANT i32 0
-; CHECK: [[Y2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Y2_OFF]](s32)
-; CHECK: G_STORE [[Y2]](s32), [[Y2_ADDR]](p0){{.*}}store 4
-; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
-; CHECK: [[Y3_OFF:%[0-9]+]](s32) = G_CONSTANT i32 8
-; CHECK: [[Y3_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Y3_OFF]](s32)
-; CHECK: G_STORE [[Y3]](s64), [[Y3_ADDR]](p0){{.*}}store 8
-; CHECK: BLX @structs_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+; CHECK: BLX @structs_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0, implicit-def %r1
; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
-; CHECK: [[R_0:%[0-9]+]](s64) = IMPLICIT_DEF
-; CHECK: [[R_1:%[0-9]+]](s64) = G_INSERT [[R_0]], [[R0]](s32), 0
-; CHECK: [[R_2:%[0-9]+]](s64) = G_INSERT [[R_1]], [[R1]](s32), 32
-; CHECK: [[R:%[0-9]+]](s64) = COPY [[R_2]]
-; CHECK: ADJCALLSTACKUP 16, 0, 14, _, implicit-def %sp, implicit %sp
-; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R]](s64), 0
-; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R]](s64), 32
+; CHECK: [[R:%[0-9]+]](s64) = G_MERGE_VALUES [[R0]](s32), [[R1]](s32)
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32), [[R1:%[0-9]+]](s32) = G_UNMERGE_VALUES [[R]](s64)
; CHECK: %r0 = COPY [[R0]](s32)
; CHECK: %r1 = COPY [[R1]](s32)
; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
- %r = notail call arm_aapcscc {i32, i32} @structs_target({i32, i32} %x, {i32*, float, i32, double} %y)
+ %r = notail call arm_aapcscc {i32, i32} @structs_target({i32, i32} %x)
ret {i32, i32} %r
}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll
index 2881740b016fd..c778caacd0f45 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll
@@ -66,3 +66,24 @@ define arm_aapcscc i8 @test_udiv_i8(i8 %a, i8 %b) {
ret i8 %r
}
+define arm_aapcscc i32 @test_srem_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_srem_i32:
+; HWDIV: sdiv [[Q:r[0-9]+]], r0, r1
+; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1
+; HWDIV: sub r0, r0, [[P]]
+; SOFT-AEABI: blx __aeabi_idivmod
+; SOFT-DEFAULT: blx __modsi3
+ %r = srem i32 %x, %y
+ ret i32 %r
+}
+
+define arm_aapcscc i32 @test_urem_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_urem_i32:
+; HWDIV: udiv [[Q:r[0-9]+]], r0, r1
+; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1
+; HWDIV: sub r0, r0, [[P]]
+; SOFT-AEABI: blx __aeabi_uidivmod
+; SOFT-DEFAULT: blx __umodsi3
+ %r = urem i32 %x, %y
+ ret i32 %r
+}
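+
+; In summary (inferred from the CHECK lines above): targets with hardware divide
+; expand the remainder inline as x - (x div y) * y, while targets without it call
+; the __aeabi_idivmod / __aeabi_uidivmod helpers (or __modsi3 / __umodsi3 with the
+; default runtime) and take the remainder from the helper's result.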
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
index 6f3e09d328cfe..c93e7fa0ec560 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir
@@ -11,6 +11,9 @@
define void @test_sdiv_i8() { ret void }
define void @test_udiv_i8() { ret void }
+
+ define void @test_srem_i32() { ret void }
+ define void @test_urem_i32() { ret void }
...
---
name: test_sdiv_i32
@@ -228,3 +231,75 @@ body: |
%r0 = COPY %2(s8)
BX_RET 14, _, implicit %r0
...
+---
+name: test_srem_i32
+# CHECK-LABEL: name: test_srem_i32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; HWDIV: [[Q:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]]
+ ; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]]
+ ; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_idivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0, implicit-def %r1
+ ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r1
+ ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ %2(s32) = G_SREM %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s32)
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_urem_i32
+# CHECK-LABEL: name: test_urem_i32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+ ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+ %0(s32) = COPY %r0
+ %1(s32) = COPY %r1
+ ; HWDIV: [[Q:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]]
+ ; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]]
+ ; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]]
+ ; SOFT: ADJCALLSTACKDOWN
+ ; SOFT-DAG: %r0 = COPY [[X]]
+ ; SOFT-DAG: %r1 = COPY [[Y]]
+ ; SOFT-AEABI: BLX $__aeabi_uidivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0, implicit-def %r1
+ ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r1
+ ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+ ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0
+ ; SOFT: ADJCALLSTACKUP
+ %2(s32) = G_UREM %0, %1
+ ; CHECK: %r0 = COPY [[R]]
+ %r0 = COPY %2(s32)
+ BX_RET 14, _, implicit %r0
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
index 34f00aebe1be5..f2f9c5d2a81df 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
@@ -65,6 +65,14 @@ define %large.struct @test_large_struct_return() {
ret %large.struct %r
}
+%mixed.struct = type {i32*, float, i32}
+
+define %mixed.struct @test_mixed_struct(%mixed.struct %x) {
+; CHECK: remark: {{.*}} unable to lower arguments: %mixed.struct (%mixed.struct)*
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_mixed_struct
+ ret %mixed.struct %x
+}
+
define void @test_vararg_definition(i32 %a, ...) {
; CHECK: remark: {{.*}} unable to lower arguments: void (i32, ...)*
; CHECK-LABEL: warning: Instruction selection used fallback path for test_vararg_definition
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
index 5f914323861ab..e234e179ed071 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -156,3 +156,41 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
%sub2 = fsub <2 x float> %sub1, %mul3
ret <2 x float> %sub2
}
+
+define float @Test5(float %f1, float %f2, float %f3) {
+; CHECK: ********** MI Scheduling **********
+; CHECK: Test5:BB#0
+
+; CHECK-DEFAULT: VNMLS
+; CHECK-FAST: VFNMS
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAS not-optimized latency to VMOVRS = 9
+; CHECK-SAME: Latency=9
+
+; f1 * f2 - f3 ==> VNMLS/VFNMS
+ %mul = fmul float %f1, %f2
+ %sub = fsub float %mul, %f3
+ ret float %sub
+}
+
+
+define float @Test6(float %f1, float %f2, float %f3) {
+; CHECK: ********** MI Scheduling **********
+; CHECK: Test6:BB#0
+
+; CHECK-DEFAULT: VNMLA
+; CHECK-FAST: VFNMA
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAS not-optimized latency to VMOVRS = 9
+; CHECK-SAME: Latency=9
+
+; -(f1 * f2) - f2 ==> VNMLA/VFNMA
+ %mul = fmul float %f1, %f2
+ %sub1 = fsub float -0.0, %mul
+ %sub2 = fsub float %sub1, %f2
+ ret float %sub2
+}
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index 1e9d890e93337..6019a9410b033 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -273,6 +273,6 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!160 = !DIFile(filename: "header.h", directory: "/Volumes/Sandbox/llvm")
!161 = !{!"header2.h", !"/Volumes/Sandbox/llvm"}
!162 = !{i32 1, !"Debug Info Version", i32 3}
-!163 = !DIExpression(DW_OP_plus, 20, DW_OP_deref, DW_OP_plus, 4, DW_OP_deref, DW_OP_plus, 24)
-!164 = !DIExpression(DW_OP_deref, DW_OP_plus, 24)
-!165 = !DIExpression(DW_OP_deref, DW_OP_plus, 28)
+!163 = !DIExpression(DW_OP_plus_uconst, 20, DW_OP_deref, DW_OP_plus_uconst, 4, DW_OP_deref, DW_OP_plus_uconst, 24)
+!164 = !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 24)
+!165 = !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 28)
diff --git a/test/CodeGen/ARM/sincos.ll b/test/CodeGen/ARM/sincos.ll
index 5be0044ddbd35..42a834d24b3e1 100644
--- a/test/CodeGen/ARM/sincos.ll
+++ b/test/CodeGen/ARM/sincos.ll
@@ -1,10 +1,12 @@
; RUN: llc < %s -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT
; RUN: llc < %s -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS
-; RUN: llc < %s -mtriple=armv7-linux-gnu -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT-GNU
+; RUN: llc < %s -mtriple=armv7-linux-gnu -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 \
; RUN: --enable-unsafe-fp-math | FileCheck %s --check-prefix=SINCOS-GNU
-; Combine sin / cos into a single call.
+; Combine sin / cos into a single call unless they may write errno (as
+; captured by the readnone attribute, controlled by clang's -fmath-errno
+; setting).
; rdar://12856873
define float @test1(float %x) nounwind {
@@ -19,12 +21,28 @@ entry:
; NOOPT: bl _sinf
; NOOPT: bl _cosf
-; NOOPT-GNU-LABEL: test1:
-; NOOPT-GNU: bl sinf
-; NOOPT-GNU: bl cosf
+ %call = tail call float @sinf(float %x) readnone
+ %call1 = tail call float @cosf(float %x) readnone
+ %add = fadd float %call, %call1
+ ret float %add
+}
+
+define float @test1_errno(float %x) nounwind {
+entry:
+; SINCOS-LABEL: test1_errno:
+; SINCOS: bl _sinf
+; SINCOS: bl _cosf
- %call = tail call float @sinf(float %x) nounwind readnone
- %call1 = tail call float @cosf(float %x) nounwind readnone
+; SINCOS-GNU-LABEL: test1_errno:
+; SINCOS-GNU: bl sinf
+; SINCOS-GNU: bl cosf
+
+; NOOPT-LABEL: test1_errno:
+; NOOPT: bl _sinf
+; NOOPT: bl _cosf
+
+ %call = tail call float @sinf(float %x)
+ %call1 = tail call float @cosf(float %x)
%add = fadd float %call, %call1
ret float %add
}
@@ -41,16 +59,33 @@ entry:
; NOOPT: bl _sin
; NOOPT: bl _cos
-; NOOPT-GNU-LABEL: test2:
-; NOOPT-GNU: bl sin
-; NOOPT-GNU: bl cos
- %call = tail call double @sin(double %x) nounwind readnone
- %call1 = tail call double @cos(double %x) nounwind readnone
+ %call = tail call double @sin(double %x) readnone
+ %call1 = tail call double @cos(double %x) readnone
+ %add = fadd double %call, %call1
+ ret double %add
+}
+
+define double @test2_errno(double %x) nounwind {
+entry:
+; SINCOS-LABEL: test2_errno:
+; SINCOS: bl _sin
+; SINCOS: bl _cos
+
+; SINCOS-GNU-LABEL: test2_errno:
+; SINCOS-GNU: bl sin
+; SINCOS-GNU: bl cos
+
+; NOOPT-LABEL: test2_errno:
+; NOOPT: bl _sin
+; NOOPT: bl _cos
+
+ %call = tail call double @sin(double %x)
+ %call1 = tail call double @cos(double %x)
%add = fadd double %call, %call1
ret double %add
}
-declare float @sinf(float) readonly
-declare double @sin(double) readonly
-declare float @cosf(float) readonly
-declare double @cos(double) readonly
+declare float @sinf(float)
+declare double @sin(double)
+declare float @cosf(float)
+declare double @cos(double)
diff --git a/test/CodeGen/ARM/swifterror.ll b/test/CodeGen/ARM/swifterror.ll
index 78764202f6273..3fd57c592bfb6 100644
--- a/test/CodeGen/ARM/swifterror.ll
+++ b/test/CodeGen/ARM/swifterror.ll
@@ -528,3 +528,31 @@ entry:
tail call void @acallee(i8* null)
ret void
}
+
+
+declare swiftcc void @foo2(%swift_error** swifterror)
+
+; Make sure we properly assign registers during fast-isel.
+; CHECK-O0-LABEL: testAssign
+; CHECK-O0: mov r8, #0
+; CHECK-O0: bl _foo2
+; CHECK-O0: str r8, [s[[STK:p.*]]]
+; CHECK-O0: ldr r0, [s[[STK]]]
+; CHECK-O0: pop
+
+; CHECK-APPLE-LABEL: testAssign
+; CHECK-APPLE: mov r8, #0
+; CHECK-APPLE: bl _foo2
+; CHECK-APPLE: mov r0, r8
+
+define swiftcc %swift_error* @testAssign(i8* %error_ref) {
+entry:
+ %error_ptr = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr
+ call swiftcc void @foo2(%swift_error** swifterror %error_ptr)
+ br label %a
+
+a:
+ %error = load %swift_error*, %swift_error** %error_ptr
+ ret %swift_error* %error
+}
diff --git a/test/CodeGen/BPF/rodata_1.ll b/test/CodeGen/BPF/rodata_1.ll
new file mode 100644
index 0000000000000..5566f76bb75c5
--- /dev/null
+++ b/test/CodeGen/BPF/rodata_1.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -march=bpfel -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=bpfeb -verify-machineinstrs | FileCheck %s
+
+; Source code:
+; struct test_t1 {
+; char a, b, c;
+; };
+; struct test_t2 {
+; int a, b, c, d, e;
+; };
+;
+; struct test_t1 g1;
+; struct test_t2 g2;
+; int test()
+; {
+; struct test_t1 t1 = {.c = 1};
+; struct test_t2 t2 = {.c = 1};
+; g1 = t1;
+; g2 = t2;
+; return 0;
+; }
+
+%struct.test_t1 = type { i8, i8, i8 }
+%struct.test_t2 = type { i32, i32, i32, i32, i32 }
+
+@test.t1 = private unnamed_addr constant %struct.test_t1 { i8 0, i8 0, i8 1 }, align 1
+@test.t2 = private unnamed_addr constant %struct.test_t2 { i32 0, i32 0, i32 1, i32 0, i32 0 }, align 4
+@g1 = common local_unnamed_addr global %struct.test_t1 zeroinitializer, align 1
+@g2 = common local_unnamed_addr global %struct.test_t2 zeroinitializer, align 4
+
+; Function Attrs: nounwind
+define i32 @test() local_unnamed_addr #0 {
+; CHECK-LABEL: test:
+
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds (%struct.test_t1, %struct.test_t1* @g1, i64 0, i32 0), i8* getelementptr inbounds (%struct.test_t1, %struct.test_t1* @test.t1, i64 0, i32 0), i64 3, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.test_t2* @g2 to i8*), i8* bitcast (%struct.test_t2* @test.t2 to i8*), i64 20, i32 4, i1 false)
+; CHECK: r1 = <MCOperand Expr:(g1)>ll
+; CHECK: r2 = 0
+; CHECK: *(u8 *)(r1 + 1) = r2
+; CHECK: r3 = 1
+; CHECK: *(u8 *)(r1 + 2) = r3
+; CHECK: r1 = <MCOperand Expr:(g2)>ll
+; CHECK: *(u32 *)(r1 + 8) = r3
+ ret i32 0
+}
+; CHECK: .section .rodata,"a",@progbits
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/BPF/rodata_2.ll b/test/CodeGen/BPF/rodata_2.ll
new file mode 100644
index 0000000000000..74b3c3640c3f2
--- /dev/null
+++ b/test/CodeGen/BPF/rodata_2.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -march=bpfel -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=bpfeb -verify-machineinstrs | FileCheck %s
+
+; Source code:
+; struct test_t1 {
+; char a;
+; int b;
+; };
+; struct test_t2 {
+; char a, b;
+; struct test_t1 c[2];
+; int d[2];
+; int e;
+; };
+; struct test_t2 g;
+; int test()
+; {
+; struct test_t2 t2 = {.c = {{}, {.b = 1}}, .d = {2, 3}};
+; g = t2;
+; return 0;
+; }
+
+%struct.test_t2 = type { i8, i8, [2 x %struct.test_t1], [2 x i32], i32 }
+%struct.test_t1 = type { i8, i32 }
+
+@test.t2 = private unnamed_addr constant %struct.test_t2 { i8 0, i8 0, [2 x %struct.test_t1] [%struct.test_t1 zeroinitializer, %struct.test_t1 { i8 0, i32 1 }], [2 x i32] [i32 2, i32 3], i32 0 }, align 4
+@g = common local_unnamed_addr global %struct.test_t2 zeroinitializer, align 4
+
+; Function Attrs: nounwind
+define i32 @test() local_unnamed_addr #0 {
+; CHECK-LABEL: test:
+
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds (%struct.test_t2, %struct.test_t2* @g, i64 0, i32 0), i8* getelementptr inbounds (%struct.test_t2, %struct.test_t2* @test.t2, i64 0, i32 0), i64 32, i32 4, i1 false)
+; CHECK: r1 = <MCOperand Expr:(g)>ll
+; CHECK: r2 = 0
+; CHECK: *(u32 *)(r1 + 28) = r2
+; CHECK: r3 = 3
+; CHECK: *(u32 *)(r1 + 24) = r3
+; CHECK: r3 = 2
+; CHECK: *(u32 *)(r1 + 20) = r3
+; CHECK: r3 = 1
+; CHECK: *(u32 *)(r1 + 16) = r3
+ ret i32 0
+}
+; CHECK: .section .rodata.cst32,"aM",@progbits,32
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/BPF/rodata_3.ll b/test/CodeGen/BPF/rodata_3.ll
new file mode 100644
index 0000000000000..814ce7645465b
--- /dev/null
+++ b/test/CodeGen/BPF/rodata_3.ll
@@ -0,0 +1,41 @@
+; REQUIRES: x86_64-linux
+; RUN: llc < %s -march=bpfel -verify-machineinstrs | FileCheck --check-prefix=CHECK-EL %s
+; RUN: llc < %s -march=bpfeb -verify-machineinstrs | FileCheck --check-prefix=CHECK-EB %s
+;
+; This test requires a little-endian host, so we specify x86_64-linux here.
+; Source code:
+; struct test_t1 {
+; char a;
+; int b, c, d;
+; };
+;
+; struct test_t1 g;
+; int test()
+; {
+; struct test_t1 t1 = {.a = 1};
+; g = t1;
+; return 0;
+; }
+
+%struct.test_t1 = type { i8, i32, i32, i32 }
+
+@test.t1 = private unnamed_addr constant %struct.test_t1 { i8 1, i32 0, i32 0, i32 0 }, align 4
+@g = common local_unnamed_addr global %struct.test_t1 zeroinitializer, align 4
+
+; Function Attrs: nounwind
+define i32 @test() local_unnamed_addr #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds (%struct.test_t1, %struct.test_t1* @g, i64 0, i32 0), i8* getelementptr inbounds (%struct.test_t1, %struct.test_t1* @test.t1, i64 0, i32 0), i64 16, i32 4, i1 false)
+; CHECK-EL: r2 = 1
+; CHECK-EL: *(u32 *)(r1 + 0) = r2
+; CHECK-EB: r2 = 16777216
+; CHECK-EB: *(u32 *)(r1 + 0) = r2
+ ret i32 0
+}
+; CHECK-EL: .section .rodata.cst16,"aM",@progbits,16
+; CHECK-EB: .section .rodata.cst16,"aM",@progbits,16
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/BPF/rodata_4.ll b/test/CodeGen/BPF/rodata_4.ll
new file mode 100644
index 0000000000000..d6b9fba5be0a7
--- /dev/null
+++ b/test/CodeGen/BPF/rodata_4.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=bpfel -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=bpfeb -verify-machineinstrs | FileCheck %s
+
+; Source code:
+; struct test_t1
+; {
+; short a;
+; short b;
+; char c;
+; };
+;
+; struct test_t1 g;
+; int test()
+; {
+; struct test_t1 t1[] = {{50, 500, 5}, {60, 600, 6}, {70, 700, 7}, {80, 800, 8} };
+;
+; g = t1[1];
+; return 0;
+; }
+
+%struct.test_t1 = type { i16, i16, i8 }
+
+@test.t1 = private unnamed_addr constant [4 x %struct.test_t1] [%struct.test_t1 { i16 50, i16 500, i8 5 }, %struct.test_t1 { i16 60, i16 600, i8 6 }, %struct.test_t1 { i16 70, i16 700, i8 7 }, %struct.test_t1 { i16 80, i16 800, i8 8 }], align 2
+@g = common local_unnamed_addr global %struct.test_t1 zeroinitializer, align 2
+
+; Function Attrs: nounwind
+define i32 @test() local_unnamed_addr #0 {
+; CHECK-LABEL: test:
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.test_t1* @g to i8*), i8* bitcast (%struct.test_t1* getelementptr inbounds ([4 x %struct.test_t1], [4 x %struct.test_t1]* @test.t1, i64 0, i64 1) to i8*), i64 6, i32 2, i1 false)
+; CHECK: r2 = 600
+; CHECK: *(u16 *)(r1 + 2) = r2
+; CHECK: r2 = 60
+; CHECK: *(u16 *)(r1 + 0) = r2
+ ret i32 0
+}
+; CHECK .section .rodata,"a",@progbits
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/Hexagon/loop-idiom/pmpy-shiftconv-fail.ll b/test/CodeGen/Hexagon/loop-idiom/pmpy-shiftconv-fail.ll
new file mode 100644
index 0000000000000..0abf8f8732009
--- /dev/null
+++ b/test/CodeGen/Hexagon/loop-idiom/pmpy-shiftconv-fail.ll
@@ -0,0 +1,48 @@
+; RUN: opt -march=hexagon -hexagon-loop-idiom -S < %s | FileCheck %s
+; REQUIRES: asserts
+;
+; Check for sane output; this used to crash.
+; CHECK: define void @fred
+
+; The conversion of shifts from right to left failed, but the return
+; code was not checked and the transformation proceeded.
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+@A = common global [256 x i32] zeroinitializer, align 8
+
+; Function Attrs: noinline nounwind
+define void @fred() local_unnamed_addr #0 {
+b0:
+ br label %b1
+
+b1: ; preds = %b13, %b0
+ %v2 = phi i32 [ 0, %b0 ], [ %v16, %b13 ]
+ br label %b3
+
+b3: ; preds = %b3, %b1
+ %v4 = phi i32 [ %v2, %b1 ], [ %v10, %b3 ]
+ %v5 = phi i32 [ 0, %b1 ], [ %v11, %b3 ]
+ %v6 = and i32 %v4, 1
+ %v7 = icmp ne i32 %v6, 0
+ %v8 = lshr i32 %v4, 1
+ %v9 = xor i32 %v8, 123456789
+ %v10 = select i1 %v7, i32 %v9, i32 %v8
+ %v11 = add nuw nsw i32 %v5, 1
+ %v12 = icmp ne i32 %v11, 8
+ br i1 %v12, label %b3, label %b13
+
+b13: ; preds = %b3
+ %v14 = phi i32 [ %v10, %b3 ]
+ %v15 = getelementptr inbounds [256 x i32], [256 x i32]* @A, i32 0, i32 %v2
+ store i32 %v14, i32* %v15, align 4
+ %v16 = add nuw nsw i32 %v2, 1
+ %v17 = icmp ne i32 %v16, 256
+ br i1 %v17, label %b1, label %b18
+
+b18: ; preds = %b13
+ ret void
+}
+
+attributes #0 = { noinline nounwind "target-cpu"="hexagonv60" }
diff --git a/test/CodeGen/Hexagon/mulh.ll b/test/CodeGen/Hexagon/mulh.ll
new file mode 100644
index 0000000000000..0442e28d4089e
--- /dev/null
+++ b/test/CodeGen/Hexagon/mulh.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target triple = "hexagon"
+
+; CHECK-LABEL: danny:
+; CHECK: r{{[0-9]+}} = mpy(r0,r1)
+define i32 @danny(i32 %a0, i32 %a1) {
+b2:
+ %v3 = sext i32 %a0 to i64
+ %v4 = sext i32 %a1 to i64
+ %v5 = mul nsw i64 %v3, %v4
+ %v6 = ashr i64 %v5, 32
+ %v7 = trunc i64 %v6 to i32
+ ret i32 %v7
+}
+
+; CHECK-LABEL: sammy:
+; CHECK: r{{[0-9]+}} = mpy(r0,r1)
+define i32 @sammy(i32 %a0, i32 %a1) {
+b2:
+ %v3 = sext i32 %a0 to i64
+ %v4 = sext i32 %a1 to i64
+ %v5 = mul nsw i64 %v3, %v4
+ %v6 = lshr i64 %v5, 32
+ %v7 = trunc i64 %v6 to i32
+ ret i32 %v7
+}
diff --git a/test/CodeGen/Hexagon/mux-kill.mir b/test/CodeGen/Hexagon/mux-kill.mir
new file mode 100644
index 0000000000000..6944050e3dab1
--- /dev/null
+++ b/test/CodeGen/Hexagon/mux-kill.mir
@@ -0,0 +1,15 @@
+# RUN: llc -march=hexagon -run-pass hexagon-gen-mux -o - %s -verify-machineinstrs | FileCheck %s
+# CHECK: %r2 = C2_mux %p0, %r0, %r1
+---
+name: fred
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: %d0, %p0
+
+ %r2 = A2_tfrt %p0, %r0
+ %r0 = A2_tfr %r1
+ %r2 = A2_tfrf %p0, killed %r1
+...
+
diff --git a/test/CodeGen/Hexagon/mux-kill2.mir b/test/CodeGen/Hexagon/mux-kill2.mir
new file mode 100644
index 0000000000000..5f34097af7cf5
--- /dev/null
+++ b/test/CodeGen/Hexagon/mux-kill2.mir
@@ -0,0 +1,17 @@
+# RUN: llc -march=hexagon -run-pass hexagon-gen-mux -o - -verify-machineinstrs %s | FileCheck %s
+# CHECK: %r1 = C2_muxri %p0, 123, %r0
+# CHECK: %r2 = C2_muxir %p0, killed %r0, 321
+---
+name: fred
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: %r0, %p0
+
+ %r2 = A2_tfrt %p0, %r0
+ %r1 = C2_cmoveit %p0, 123
+ %r1 = A2_tfrf %p0, killed %r0, implicit killed %r1
+ %r2 = C2_cmoveif killed %p0, 321, implicit killed %r2
+...
+
diff --git a/test/CodeGen/Hexagon/store-imm-stack-object.ll b/test/CodeGen/Hexagon/store-imm-stack-object.ll
new file mode 100644
index 0000000000000..8de310953aee0
--- /dev/null
+++ b/test/CodeGen/Hexagon/store-imm-stack-object.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target triple = "hexagon"
+
+; CHECK-LABEL: test1:
+; CHECK: [[REG1:(r[0-9]+)]] = ##875770417
+; CHECK-DAG: memw(r29+#4) = [[REG1]]
+; CHECK-DAG: memw(r29+#8) = #51
+; CHECK-DAG: memh(r29+#12) = #50
+; CHECK-DAG: memb(r29+#15) = #49
+define void @test1() {
+b0:
+ %v1 = alloca [1 x i8], align 1
+ %v2 = alloca i16, align 2
+ %v3 = alloca i32, align 4
+ %v4 = alloca i32, align 4
+ %v5 = getelementptr inbounds [1 x i8], [1 x i8]* %v1, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 1, i8* %v5)
+ store i8 49, i8* %v5, align 1
+ %v6 = bitcast i16* %v2 to i8*
+ call void @llvm.lifetime.start(i64 2, i8* %v6)
+ store i16 50, i16* %v2, align 2
+ %v7 = bitcast i32* %v3 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %v7)
+ store i32 51, i32* %v3, align 4
+ %v8 = bitcast i32* %v4 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %v8)
+ store i32 875770417, i32* %v4, align 4
+ call void @test4(i8* %v5, i8* %v6, i8* %v7, i8* %v8)
+ call void @llvm.lifetime.end(i64 4, i8* %v8)
+ call void @llvm.lifetime.end(i64 4, i8* %v7)
+ call void @llvm.lifetime.end(i64 2, i8* %v6)
+ call void @llvm.lifetime.end(i64 1, i8* %v5)
+ ret void
+}
+
+; CHECK-LABEL: test2:
+; CHECK-DAG: memw(r29+#208) = #51
+; CHECK-DAG: memh(r29+#212) = r{{[0-9]+}}
+; CHECK-DAG: memb(r29+#215) = r{{[0-9]+}}
+define void @test2() {
+b0:
+ %v1 = alloca [1 x i8], align 1
+ %v2 = alloca i16, align 2
+ %v3 = alloca i32, align 4
+ %v4 = alloca i32, align 4
+ %v5 = alloca [100 x i8], align 8
+ %v6 = alloca [101 x i8], align 8
+ %v7 = getelementptr inbounds [1 x i8], [1 x i8]* %v1, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 1, i8* %v7)
+ store i8 49, i8* %v7, align 1
+ %v8 = bitcast i16* %v2 to i8*
+ call void @llvm.lifetime.start(i64 2, i8* %v8)
+ store i16 50, i16* %v2, align 2
+ %v9 = bitcast i32* %v3 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %v9)
+ store i32 51, i32* %v3, align 4
+ %v10 = bitcast i32* %v4 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %v10)
+ store i32 875770417, i32* %v4, align 4
+ %v11 = getelementptr inbounds [100 x i8], [100 x i8]* %v5, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 100, i8* %v11)
+ call void @llvm.memset.p0i8.i32(i8* %v11, i8 0, i32 100, i32 8, i1 false)
+ store i8 50, i8* %v11, align 8
+ %v12 = getelementptr inbounds [101 x i8], [101 x i8]* %v6, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 101, i8* %v12)
+ call void @llvm.memset.p0i8.i32(i8* %v12, i8 0, i32 101, i32 8, i1 false)
+ store i8 49, i8* %v12, align 8
+ call void @test3(i8* %v7, i8* %v8, i8* %v9, i8* %v10, i8* %v11, i8* %v12)
+ call void @llvm.lifetime.end(i64 101, i8* %v12)
+ call void @llvm.lifetime.end(i64 100, i8* %v11)
+ call void @llvm.lifetime.end(i64 4, i8* %v10)
+ call void @llvm.lifetime.end(i64 4, i8* %v9)
+ call void @llvm.lifetime.end(i64 2, i8* %v8)
+ call void @llvm.lifetime.end(i64 1, i8* %v7)
+ ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1) #0
+
+declare void @test3(i8*, i8*, i8*, i8*, i8*, i8*)
+declare void @test4(i8*, i8*, i8*, i8*)
+
+attributes #0 = { argmemonly nounwind "target-cpu"="hexagonv60" }
diff --git a/test/CodeGen/Mips/2008-06-05-Carry.ll b/test/CodeGen/Mips/2008-06-05-Carry.ll
index c61e1cdedea78..5e6092fc7848d 100644
--- a/test/CodeGen/Mips/2008-06-05-Carry.ll
+++ b/test/CodeGen/Mips/2008-06-05-Carry.ll
@@ -2,20 +2,21 @@
define i64 @add64(i64 %u, i64 %v) nounwind {
entry:
+; CHECK-LABEL: add64:
; CHECK: addu
-; CHECK: sltu
+; CHECK-DAG: sltu
+; CHECK-DAG: addu
; CHECK: addu
-; CHECK: addu
- %tmp2 = add i64 %u, %v
+ %tmp2 = add i64 %u, %v
ret i64 %tmp2
}
define i64 @sub64(i64 %u, i64 %v) nounwind {
entry:
-; CHECK: sub64
+; CHECK-LABEL: sub64:
+; CHECK-DAG: sltu
+; CHECK-DAG: subu
; CHECK: subu
-; CHECK: sltu
-; CHECK: addu
; CHECK: subu
%tmp2 = sub i64 %u, %v
ret i64 %tmp2
diff --git a/test/CodeGen/Mips/brundef.ll b/test/CodeGen/Mips/brundef.ll
new file mode 100644
index 0000000000000..802556c7cabd1
--- /dev/null
+++ b/test/CodeGen/Mips/brundef.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=mips -mcpu=mips32 -verify-machineinstrs -o /dev/null < %s
+; Confirm that MachineInstr branch simplification preserves
+; register operand flags, such as the <undef> flag.
+
+define void @ham() {
+bb:
+ %tmp = alloca i32, align 4
+ %tmp13 = ptrtoint i32* %tmp to i32
+ %tmp70 = icmp eq i32 undef, -1
+ br i1 %tmp70, label %bb72, label %bb40
+
+bb72: ; preds = %bb72, %bb
+ br i1 undef, label %bb40, label %bb72
+
+bb40: ; preds = %bb72, %bb
+ %tmp41 = phi i32 [ %tmp13, %bb72 ], [ %tmp13, %bb ]
+ %tmp55 = inttoptr i32 %tmp41 to i32*
+ %tmp58 = insertelement <2 x i32*> undef, i32* %tmp55, i32 1
+ br label %bb59
+
+bb59: ; preds = %bb59, %bb40
+ %tmp60 = phi <2 x i32*> [ %tmp61, %bb59 ], [ %tmp58, %bb40 ]
+ %tmp61 = getelementptr i32, <2 x i32*> %tmp60, <2 x i32> <i32 -1, i32 1>
+ %tmp62 = extractelement <2 x i32*> %tmp61, i32 1
+ br label %bb59
+}
diff --git a/test/CodeGen/Mips/dsp-patterns.ll b/test/CodeGen/Mips/dsp-patterns.ll
index 837c0d8bfc52b..250d3eff37dc5 100644
--- a/test/CodeGen/Mips/dsp-patterns.ll
+++ b/test/CodeGen/Mips/dsp-patterns.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=mips -mattr=dsp < %s | FileCheck %s -check-prefix=R1
-; RUN: llc -march=mips -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dsp < %s | FileCheck %s -check-prefix=R1
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2
; R1-LABEL: test_lbux:
; R1: lbux ${{[0-9]+}}
diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll
index fcf129420234c..b7cc6fc8ea757 100644
--- a/test/CodeGen/Mips/llcarry.ll
+++ b/test/CodeGen/Mips/llcarry.ll
@@ -14,9 +14,9 @@ entry:
%add = add nsw i64 %1, %0
store i64 %add, i64* @k, align 8
; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
-; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
-; 16: move ${{[0-9]+}}, $t8
; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $24
; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
ret void
}
@@ -28,8 +28,8 @@ entry:
%sub = sub nsw i64 %0, %1
; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
-; 16: move ${{[0-9]+}}, $t8
-; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $24
+; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
store i64 %sub, i64* @l, align 8
ret void
@@ -41,8 +41,7 @@ entry:
%add = add nsw i64 %0, 15
; 16: addiu ${{[0-9]+}}, 15
; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
-; 16: move ${{[0-9]+}}, $t8
-; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $24
; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
store i64 %add, i64* @m, align 8
ret void
diff --git a/test/CodeGen/Mips/llvm-ir/add.ll b/test/CodeGen/Mips/llvm-ir/add.ll
index a5ecdda94ce2f..63884eb03b8c5 100644
--- a/test/CodeGen/Mips/llvm-ir/add.ll
+++ b/test/CodeGen/Mips/llvm-ir/add.ll
@@ -1,35 +1,35 @@
; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,PRE4
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,GP32-CMOV
; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefixes=ALL,R2-R6,GP32
+; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV
; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefixes=ALL,R2-R6,GP32
+; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV
; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefixes=ALL,R2-R6,GP32
+; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV
; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
; RUN: -check-prefixes=ALL,R2-R6,GP32
; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6
; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6
; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefixes=ALL,R2-R6,GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6
; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefixes=ALL,R2-R6,GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6
; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefixes=ALL,R2-R6,GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefixes=ALL,R2-R6,GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6
; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -O2 -verify-machineinstrs | FileCheck %s \
-; RUN: -check-prefixes=ALL,MMR6,MM32
+; RUN: -check-prefixes=ALL,MMR3,MM32
; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -O2 | FileCheck %s \
; RUN: -check-prefixes=ALL,MMR6,MM32
; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -O2 | FileCheck %s \
-; RUN: -check-prefixes=ALL,MMR6,MM64
+; RUN: -check-prefixes=ALL,MM64
 ; FIXME: This code sequence is inefficient as it should be 'subu $[[T0]], $zero, $[[T0]]'.
@@ -110,17 +110,17 @@ define signext i64 @add_i64(i64 signext %a, i64 signext %b) {
entry:
; ALL-LABEL: add_i64:
- ; GP32: addu $3, $5, $7
- ; GP32: sltu $[[T0:[0-9]+]], $3, $7
- ; GP32: addu $[[T1:[0-9]+]], $[[T0]], $6
- ; GP32: addu $2, $4, $[[T1]]
+ ; GP32-DAG: addu $[[T0:[0-9]+]], $4, $6
+ ; GP32-DAG: addu $3, $5, $7
+ ; GP32: sltu $[[T1:[0-9]+]], $3, $5
+ ; GP32: addu $2, $[[T0]], $[[T1]]
; GP64: daddu $2, $4, $5
- ; MM32: addu16 $3, $5, $7
- ; MM32: sltu $[[T0:[0-9]+]], $3, $7
- ; MM32: addu $[[T1:[0-9]+]], $[[T0]], $6
- ; MM32: addu $2, $4, $[[T1]]
+ ; MM32-DAG: addu16 $3, $5, $7
+ ; MM32-DAG: addu16 $[[T0:[0-9]+]], $4, $6
+ ; MM32: sltu $[[T1:[0-9]+]], $3, $5
+ ; MM32: addu16 $2, $[[T0]], $[[T1]]
; MM64: daddu $2, $4, $5
@@ -132,49 +132,108 @@ define signext i128 @add_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: add_i128:
- ; GP32: lw $[[T0:[0-9]+]], 28($sp)
- ; GP32: addu $[[T1:[0-9]+]], $7, $[[T0]]
- ; GP32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
- ; GP32: lw $[[T3:[0-9]+]], 24($sp)
- ; GP32: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]]
- ; GP32: addu $[[T5:[0-9]+]], $6, $[[T4]]
- ; GP32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]]
- ; GP32: lw $[[T7:[0-9]+]], 20($sp)
- ; GP32: addu $[[T8:[0-9]+]], $[[T6]], $[[T7]]
- ; GP32: lw $[[T9:[0-9]+]], 16($sp)
- ; GP32: addu $3, $5, $[[T8]]
- ; GP32: sltu $[[T10:[0-9]+]], $3, $[[T7]]
- ; GP32: addu $[[T11:[0-9]+]], $[[T10]], $[[T9]]
- ; GP32: addu $2, $4, $[[T11]]
- ; GP32: move $4, $[[T5]]
- ; GP32: move $5, $[[T1]]
-
- ; GP64: daddu $3, $5, $7
- ; GP64: sltu $[[T0:[0-9]+]], $3, $7
- ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6
- ; GP64: daddu $2, $4, $[[T1]]
-
- ; MM32: lw $[[T0:[0-9]+]], 28($sp)
- ; MM32: addu $[[T1:[0-9]+]], $7, $[[T0]]
- ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
- ; MM32: lw $[[T3:[0-9]+]], 24($sp)
- ; MM32: addu16 $[[T4:[0-9]+]], $[[T2]], $[[T3]]
- ; MM32: addu16 $[[T5:[0-9]+]], $6, $[[T4]]
- ; MM32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]]
- ; MM32: lw $[[T7:[0-9]+]], 20($sp)
- ; MM32: addu16 $[[T8:[0-9]+]], $[[T6]], $[[T7]]
- ; MM32: lw $[[T9:[0-9]+]], 16($sp)
- ; MM32: addu16 $[[T10:[0-9]+]], $5, $[[T8]]
- ; MM32: sltu $[[T11:[0-9]+]], $[[T10]], $[[T7]]
- ; MM32: addu $[[T12:[0-9]+]], $[[T11]], $[[T9]]
- ; MM32: addu16 $[[T13:[0-9]+]], $4, $[[T12]]
- ; MM32: move $4, $[[T5]]
- ; MM32: move $5, $[[T1]]
-
+ ; PRE4: move $[[R1:[0-9]+]], $5
+ ; PRE4: move $[[R2:[0-9]+]], $4
+ ; PRE4: lw $[[R3:[0-9]+]], 24($sp)
+ ; PRE4: addu $[[R4:[0-9]+]], $6, $[[R3]]
+ ; PRE4: lw $[[R5:[0-9]+]], 28($sp)
+ ; PRE4: addu $[[R6:[0-9]+]], $7, $[[R5]]
+ ; PRE4: sltu $[[R7:[0-9]+]], $[[R6]], $7
+ ; PRE4: addu $[[R8:[0-9]+]], $[[R4]], $[[R7]]
+ ; PRE4: xor $[[R9:[0-9]+]], $[[R8]], $6
+ ; PRE4: sltiu $[[R10:[0-9]+]], $[[R9]], 1
+ ; PRE4: bnez $[[R10]], $BB5_2
+ ; PRE4: sltu $[[R7]], $[[R8]], $6
+ ; PRE4: lw $[[R12:[0-9]+]], 20($sp)
+ ; PRE4: addu $[[R13:[0-9]+]], $[[R1]], $[[R12]]
+ ; PRE4: lw $[[R14:[0-9]+]], 16($sp)
+ ; PRE4: addu $[[R15:[0-9]+]], $[[R13]], $[[R7]]
+ ; PRE4: addu $[[R16:[0-9]+]], $[[R2]], $[[R14]]
+ ; PRE4: sltu $[[R17:[0-9]+]], $[[R15]], $[[R13]]
+ ; PRE4: sltu $[[R18:[0-9]+]], $[[R13]], $[[R1]]
+ ; PRE4: addu $[[R19:[0-9]+]], $[[R16]], $[[R18]]
+ ; PRE4: addu $2, $[[R19]], $[[R17]]
+
+ ; GP32-CMOV: lw $[[T0:[0-9]+]], 24($sp)
+ ; GP32-CMOV: addu $[[T1:[0-9]+]], $6, $[[T0]]
+ ; GP32-CMOV: lw $[[T2:[0-9]+]], 28($sp)
+ ; GP32-CMOV: addu $[[T3:[0-9]+]], $7, $[[T2]]
+ ; GP32-CMOV: sltu $[[T4:[0-9]+]], $[[T3]], $7
+ ; GP32-CMOV: addu $[[T5:[0-9]+]], $[[T1]], $[[T4]]
+ ; GP32-CMOV: sltu $[[T6:[0-9]+]], $[[T5]], $6
+ ; GP32-CMOV: xor $[[T7:[0-9]+]], $[[T5]], $6
+ ; GP32-CMOV: movz $[[T8:[0-9]+]], $[[T4]], $[[T7]]
+ ; GP32-CMOV: lw $[[T9:[0-9]+]], 20($sp)
+ ; GP32-CMOV: addu $[[T10:[0-9]+]], $5, $[[T4]]
+ ; GP32-CMOV: addu $[[T11:[0-9]+]], $[[T10]], $[[T8]]
+ ; GP32-CMOV: lw $[[T12:[0-9]+]], 16($sp)
+ ; GP32-CMOV: sltu $[[T13:[0-9]+]], $[[T11]], $[[T10]]
+ ; GP32-CMOV: addu $[[T14:[0-9]+]], $4, $[[T12]]
+ ; GP32-CMOV: sltu $[[T15:[0-9]+]], $[[T10]], $5
+ ; GP32-CMOV: addu $[[T16:[0-9]+]], $[[T14]], $[[T15]]
+ ; GP32-CMOV: addu $[[T17:[0-9]+]], $[[T16]], $[[T13]]
+ ; GP32-CMOV: move $4, $[[T5]]
+ ; GP32-CMOV: move $5, $[[T3]]
+
+ ; GP64: daddu $[[T0:[0-9]+]], $4, $6
+ ; GP64: daddu $[[T1:[0-9]+]], $5, $7
+ ; GP64: sltu $[[T2:[0-9]+]], $[[T1]], $5
+ ; GP64-NOT-R2-R6: dsll $[[T3:[0-9]+]], $[[T2]], 32
+ ; GP64-NOT-R2-R6: dsrl $[[T4:[0-9]+]], $[[T3]], 32
+ ; GP64-R2-R6: dext $[[T4:[0-9]+]], $[[T2]], 0, 32
+
+ ; GP64: daddu $2, $[[T0]], $[[T4]]
+
+ ; MMR3: move $[[T1:[0-9]+]], $5
+ ; MMR3-DAG: lw $[[T2:[0-9]+]], 32($sp)
+ ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]]
+ ; MMR3-DAG: lw $[[T4:[0-9]+]], 36($sp)
+ ; MMR3: addu16 $[[T5:[0-9]+]], $7, $[[T4]]
+ ; MMR3: sltu $[[T6:[0-9]+]], $[[T5]], $7
+ ; MMR3: addu16 $[[T7:[0-9]+]], $[[T3]], $[[T6]]
+ ; MMR3: sltu $[[T8:[0-9]+]], $[[T7]], $6
+ ; MMR3: xor $[[T9:[0-9]+]], $[[T7]], $6
+ ; MMR3: movz $[[T8]], $[[T6]], $[[T9]]
+ ; MMR3: lw $[[T10:[0-9]+]], 28($sp)
+ ; MMR3: addu16 $[[T11:[0-9]+]], $[[T1]], $[[T10]]
+ ; MMR3: addu16 $[[T12:[0-9]+]], $[[T11]], $[[T8]]
+ ; MMR3: lw $[[T13:[0-9]+]], 24($sp)
+ ; MMR3: sltu $[[T14:[0-9]+]], $[[T12]], $[[T11]]
+ ; MMR3: addu16 $[[T15:[0-9]+]], $4, $[[T13]]
+ ; MMR3: sltu $[[T16:[0-9]+]], $[[T11]], $[[T1]]
+ ; MMR3: addu16 $[[T17:[0-9]+]], $[[T15]], $[[T16]]
+ ; MMR3: addu16 $2, $2, $[[T14]]
+
+ ; MMR6: move $[[T1:[0-9]+]], $5
+ ; MMR6: move $[[T2:[0-9]+]], $4
+ ; MMR6: lw $[[T3:[0-9]+]], 32($sp)
+ ; MMR6: addu16 $[[T4:[0-9]+]], $6, $[[T3]]
+ ; MMR6: lw $[[T5:[0-9]+]], 36($sp)
+ ; MMR6: addu16 $[[T6:[0-9]+]], $7, $[[T5]]
+ ; MMR6: sltu $[[T7:[0-9]+]], $[[T6]], $7
+ ; MMR6: addu16 $[[T8:[0-9]+]], $[[T4]], $7
+ ; MMR6: sltu $[[T9:[0-9]+]], $[[T8]], $6
+ ; MMR6: xor $[[T10:[0-9]+]], $[[T4]], $6
+ ; MMR6: sltiu $[[T11:[0-9]+]], $[[T10]], 1
+ ; MMR6: seleqz $[[T12:[0-9]+]], $[[T9]], $[[T11]]
+ ; MMR6: selnez $[[T13:[0-9]+]], $[[T7]], $[[T11]]
+ ; MMR6: lw $[[T14:[0-9]+]], 24($sp)
+ ; MMR6: or $[[T15:[0-9]+]], $[[T13]], $[[T12]]
+ ; MMR6: addu16 $[[T16:[0-9]+]], $[[T2]], $[[T14]]
+ ; MMR6: lw $[[T17:[0-9]+]], 28($sp)
+ ; MMR6: addu16 $[[T18:[0-9]+]], $[[T1]], $[[T17]]
+ ; MMR6: addu16 $[[T19:[0-9]+]], $[[T18]], $[[T15]]
+ ; MMR6: sltu $[[T20:[0-9]+]], $[[T18]], $[[T1]]
+ ; MMR6: sltu $[[T21:[0-9]+]], $[[T17]], $[[T18]]
+ ; MMR6: addu16 $2, $[[T16]], $[[T20]]
+ ; MMR6: addu16 $2, $[[T20]], $[[T21]]
+
+ ; MM64: daddu $[[T0:[0-9]+]], $4, $6
; MM64: daddu $3, $5, $7
- ; MM64: sltu $[[T0:[0-9]+]], $3, $7
- ; MM64: daddu $[[T1:[0-9]+]], $[[T0]], $6
- ; MM64: daddu $2, $4, $[[T1]]
+ ; MM64: sltu $[[T1:[0-9]+]], $3, $5
+ ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32
+ ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+ ; MM64: daddu $2, $[[T0]], $[[T3]]
%r = add i128 %a, %b
ret i128 %r
@@ -249,17 +308,16 @@ define signext i32 @add_i32_4(i32 signext %a) {
define signext i64 @add_i64_4(i64 signext %a) {
; ALL-LABEL: add_i64_4:
- ; GP32: addiu $[[T0:[0-9]+]], $5, 4
- ; GP32: addiu $[[T1:[0-9]+]], $zero, 4
- ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
- ; GP32: addu $2, $4, $[[T1]]
+ ; GP32: addiu $3, $5, 4
+ ; GP32: sltu $[[T0:[0-9]+]], $3, $5
+ ; GP32: addu $2, $4, $[[T0]]
+
+ ; MM32: addiur2 $[[T1:[0-9]+]], $5, 4
+ ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5
+ ; MM32: addu16 $2, $4, $[[T2]]
; GP64: daddiu $2, $4, 4
- ; MM32: addiu $[[T0:[0-9]+]], $5, 4
- ; MM32: li16 $[[T1:[0-9]+]], 4
- ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]]
- ; MM32: addu $2, $4, $[[T2]]
; MM64: daddiu $2, $4, 4
@@ -270,38 +328,67 @@ define signext i64 @add_i64_4(i64 signext %a) {
define signext i128 @add_i128_4(i128 signext %a) {
; ALL-LABEL: add_i128_4:
- ; GP32: addiu $[[T0:[0-9]+]], $7, 4
- ; GP32: addiu $[[T1:[0-9]+]], $zero, 4
- ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
- ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]]
- ; GP32: sltu $[[T1]], $[[T2]], $zero
- ; GP32: addu $[[T3:[0-9]+]], $5, $[[T1]]
- ; GP32: sltu $[[T1]], $[[T3]], $zero
- ; GP32: addu $[[T1]], $4, $[[T1]]
- ; GP32: move $4, $[[T2]]
- ; GP32: move $5, $[[T0]]
-
- ; GP64: daddiu $[[T0:[0-9]+]], $5, 4
- ; GP64: daddiu $[[T1:[0-9]+]], $zero, 4
- ; GP64: sltu $[[T1]], $[[T0]], $[[T1]]
- ; GP64: daddu $2, $4, $[[T1]]
-
- ; MM32: addiu $[[T0:[0-9]+]], $7, 4
- ; MM32: li16 $[[T1:[0-9]+]], 4
- ; MM32: sltu $[[T1]], $[[T0]], $[[T1]]
- ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]]
- ; MM32: li16 $[[T1]], 0
- ; MM32: sltu $[[T3:[0-9]+]], $[[T2]], $[[T1]]
- ; MM32: addu16 $[[T3]], $5, $[[T3]]
- ; MM32: sltu $[[T1]], $[[T3]], $[[T1]]
- ; MM32: addu16 $[[T1]], $4, $[[T1]]
- ; MM32: move $4, $[[T2]]
- ; MM32: move $5, $[[T0]]
+ ; PRE4: move $[[T0:[0-9]+]], $5
+ ; PRE4: addiu $[[T1:[0-9]+]], $7, 4
+ ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7
+ ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1
+ ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]]
+ ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]]
+ ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6
+ ; PRE4: $BB[[BB0]]:
+ ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]]
+ ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]]
+ ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]]
+ ; PRE4: move $4, $[[T4]]
+
+ ; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 4
+ ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7
+ ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]]
+ ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6
+ ; GP32-CMOV: movz $[[T3]], $[[T1]], $[[T1]]
+ ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]]
+ ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5
+ ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]]
+ ; GP32-CMOV: move $4, $[[T2]]
+ ; GP32-CMOV: move $5, $[[T0]]
+
+ ; GP64: daddiu $[[T0:[0-9]+]], $5, 4
+ ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5
+ ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32
+ ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+ ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32
+
+ ; GP64: daddu $2, $4, $[[T3]]
+
+ ; MMR3: addiur2 $[[T0:[0-9]+]], $7, 4
+ ; MMR3: sltu $[[T1:[0-9]+]], $[[T0]], $7
+ ; MMR3: sltu $[[T2:[0-9]+]], $[[T0]], $7
+ ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]]
+ ; MMR3: sltu $[[T4:[0-9]+]], $[[T3]], $6
+ ; MMR3: movz $[[T4]], $[[T2]], $[[T1]]
+ ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T4]]
+ ; MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5
+ ; MMR3: addu16 $2, $4, $[[T7]]
+
+ ; MMR6: addiur2 $[[T1:[0-9]+]], $7, 4
+ ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7
+ ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1
+ ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+ ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]]
+ ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6
+ ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]]
+ ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]]
+ ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]]
+ ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5
+ ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]]
+ ; MMR6: move $4, $7
+ ; MMR6: move $5, $[[T1]]
; MM64: daddiu $[[T0:[0-9]+]], $5, 4
- ; MM64: daddiu $[[T1:[0-9]+]], $zero, 4
- ; MM64: sltu $[[T1]], $[[T0]], $[[T1]]
- ; MM64: daddu $2, $4, $[[T1]]
+ ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5
+ ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32
+ ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+ ; MM64: daddu $2, $4, $[[T3]]
%r = add i128 4, %a
ret i128 %r
@@ -380,16 +467,15 @@ define signext i64 @add_i64_3(i64 signext %a) {
; ALL-LABEL: add_i64_3:
; GP32: addiu $[[T0:[0-9]+]], $5, 3
- ; GP32: addiu $[[T1:[0-9]+]], $zero, 3
- ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; GP32: sltu $[[T1:[0-9]+]], $[[T0]], $5
; GP32: addu $2, $4, $[[T1]]
; GP64: daddiu $2, $4, 3
- ; MM32: addiu $[[T0:[0-9]+]], $5, 3
- ; MM32: li16 $[[T1:[0-9]+]], 3
- ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]]
- ; MM32: addu $2, $4, $[[T2]]
+ ; MM32: move $[[T1:[0-9]+]], $5
+ ; MM32: addius5 $[[T1]], 3
+ ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5
+ ; MM32: addu16 $2, $4, $[[T2]]
; MM64: daddiu $2, $4, 3
@@ -400,38 +486,70 @@ define signext i64 @add_i64_3(i64 signext %a) {
define signext i128 @add_i128_3(i128 signext %a) {
; ALL-LABEL: add_i128_3:
- ; GP32: addiu $[[T0:[0-9]+]], $7, 3
- ; GP32: addiu $[[T1:[0-9]+]], $zero, 3
- ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
- ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]]
- ; GP32: sltu $[[T3:[0-9]+]], $[[T2]], $zero
- ; GP32: addu $[[T4:[0-9]+]], $5, $[[T3]]
- ; GP32: sltu $[[T5:[0-9]+]], $[[T4]], $zero
- ; GP32: addu $[[T5]], $4, $[[T5]]
- ; GP32: move $4, $[[T2]]
- ; GP32: move $5, $[[T0]]
-
- ; GP64: daddiu $[[T0:[0-9]+]], $5, 3
- ; GP64: daddiu $[[T1:[0-9]+]], $zero, 3
- ; GP64: sltu $[[T1]], $[[T0]], $[[T1]]
- ; GP64: daddu $2, $4, $[[T1]]
-
- ; MM32: addiu $[[T0:[0-9]+]], $7, 3
- ; MM32: li16 $[[T1:[0-9]+]], 3
- ; MM32: sltu $[[T1]], $[[T0]], $[[T1]]
- ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]]
- ; MM32: li16 $[[T3:[0-9]+]], 0
- ; MM32: sltu $[[T4:[0-9]+]], $[[T2]], $[[T3]]
- ; MM32: addu16 $[[T4]], $5, $[[T4]]
- ; MM32: sltu $[[T5:[0-9]+]], $[[T4]], $[[T3]]
- ; MM32: addu16 $[[T5]], $4, $[[T5]]
- ; MM32: move $4, $[[T2]]
- ; MM32: move $5, $[[T0]]
+ ; PRE4: move $[[T0:[0-9]+]], $5
+ ; PRE4: addiu $[[T1:[0-9]+]], $7, 3
+ ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7
+ ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1
+ ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]]
+ ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]]
+ ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6
+ ; PRE4: $BB[[BB0]]:
+ ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]]
+ ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]]
+ ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]]
+ ; PRE4: move $4, $[[T4]]
+
+ ; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 3
+ ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7
+ ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]]
+ ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6
+ ; GP32-CMOV: movz $[[T3]], $[[T1]], $[[T1]]
+ ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]]
+ ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5
+ ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]]
+ ; GP32-CMOV: move $4, $[[T2]]
+ ; GP32-CMOV: move $5, $[[T0]]
+
+ ; GP64: daddiu $[[T0:[0-9]+]], $5, 3
+ ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5
+
+ ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32
+ ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+ ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32
+
+ ; GP64: daddu $2, $4, $[[T3]]
+
+ ; MMR3: move $[[T1:[0-9]+]], $7
+ ; MMR3: addius5 $[[T1]], 3
+ ; MMR3: sltu $[[T2:[0-9]+]], $[[T1]], $7
+ ; MMR3: sltu $[[T3:[0-9]+]], $[[T1]], $7
+ ; MMR3: addu16 $[[T4:[0-9]+]], $6, $[[T3]]
+ ; MMR3: sltu $[[T5:[0-9]+]], $[[T4]], $6
+ ; MMR3: movz $[[T5]], $[[T3]], $[[T2]]
+ ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T5]]
+ ; MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5
+ ; MMR3: addu16 $2, $4, $[[T7]]
+
+ ; MMR6: move $[[T1:[0-9]+]], $7
+ ; MMR6: addius5 $[[T1]], 3
+ ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7
+ ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1
+ ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+ ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]]
+ ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6
+ ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]]
+ ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]]
+ ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]]
+ ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5
+ ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]]
+ ; MMR6: move $4, $[[T5]]
+ ; MMR6: move $5, $[[T1]]
; MM64: daddiu $[[T0:[0-9]+]], $5, 3
- ; MM64: daddiu $[[T1:[0-9]+]], $zero, 3
- ; MM64: sltu $[[T1]], $[[T0]], $[[T1]]
- ; MM64: daddu $2, $4, $[[T1]]
+ ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5
+ ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32
+ ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+ ; MM64: daddu $2, $4, $[[T3]]
%r = add i128 3, %a
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/sub.ll b/test/CodeGen/Mips/llvm-ir/sub.ll
index a730063c552f4..655addb10a64e 100644
--- a/test/CodeGen/Mips/llvm-ir/sub.ll
+++ b/test/CodeGen/Mips/llvm-ir/sub.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM
+; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM,PRE4
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM
; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
@@ -11,25 +11,25 @@
; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM
; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -verify-machineinstrs | FileCheck %s \
-; RUN: -check-prefixes=GP32-MM,GP32,MM
+; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR3
; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
-; RUN: -check-prefixes=GP32-MM,GP32,MM
+; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR6
; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM
+; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2
; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM
+; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2
; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM
+; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2
; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2
; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -mattr=+micromips | FileCheck %s \
-; RUN: -check-prefixes=GP64,MM
+; RUN: -check-prefixes=GP64,MM64
define signext i1 @sub_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -100,10 +100,15 @@ define signext i64 @sub_i64(i64 signext %a, i64 signext %b) {
entry:
; ALL-LABEL: sub_i64:
- ; GP32-NOT-MM subu $3, $5, $7
- ; GP32: sltu $[[T0:[0-9]+]], $5, $7
- ; GP32: addu $[[T1:[0-9]+]], $[[T0]], $6
- ; GP32: subu $2, $4, $[[T1]]
+ ; GP32-NOT-MM: sltu $[[T0:[0-9]+]], $5, $7
+ ; GP32-NOT-MM: subu $2, $4, $6
+ ; GP32-NOT-MM: subu $2, $2, $[[T0]]
+ ; GP32-NOT-MM: subu $3, $5, $7
+
+ ; MM32: sltu $[[T0:[0-9]+]], $5, $7
+ ; MM32: subu16 $3, $4, $6
+ ; MM32: subu16 $2, $3, $[[T0]]
+ ; MM32: subu16 $3, $5, $7
; GP64: dsubu $2, $4, $5
@@ -115,42 +120,109 @@ define signext i128 @sub_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: sub_i128:
- ; GP32-NOT-MM: lw $[[T0:[0-9]+]], 20($sp)
- ; GP32-NOT-MM: sltu $[[T1:[0-9]+]], $5, $[[T0]]
- ; GP32-NOT-MM: lw $[[T2:[0-9]+]], 16($sp)
- ; GP32-NOT-MM: addu $[[T3:[0-9]+]], $[[T1]], $[[T2]]
- ; GP32-NOT-MM: lw $[[T4:[0-9]+]], 24($sp)
- ; GP32-NOT-MM: lw $[[T5:[0-9]+]], 28($sp)
- ; GP32-NOT-MM: subu $[[T6:[0-9]+]], $7, $[[T5]]
- ; GP32-NOT-MM: subu $2, $4, $[[T3]]
- ; GP32-NOT-MM: sltu $[[T8:[0-9]+]], $6, $[[T4]]
- ; GP32-NOT-MM: addu $[[T9:[0-9]+]], $[[T8]], $[[T0]]
- ; GP32-NOT-MM: subu $3, $5, $[[T9]]
- ; GP32-NOT-MM: sltu $[[T10:[0-9]+]], $7, $[[T5]]
- ; GP32-NOT-MM: addu $[[T11:[0-9]+]], $[[T10]], $[[T4]]
- ; GP32-NOT-MM: subu $4, $6, $[[T11]]
- ; GP32-NOT-MM: move $5, $[[T6]]
-
- ; GP32-MM: lw $[[T0:[0-9]+]], 20($sp)
- ; GP32-MM: sltu $[[T1:[0-9]+]], $[[T2:[0-9]+]], $[[T0]]
- ; GP32-MM: lw $[[T3:[0-9]+]], 16($sp)
- ; GP32-MM: addu $[[T3]], $[[T1]], $[[T3]]
- ; GP32-MM: lw $[[T4:[0-9]+]], 24($sp)
- ; GP32-MM: lw $[[T5:[0-9]+]], 28($sp)
- ; GP32-MM: subu $[[T1]], $7, $[[T5]]
- ; GP32-MM: subu16 $[[T3]], $[[T6:[0-9]+]], $[[T3]]
- ; GP32-MM: sltu $[[T6]], $6, $[[T4]]
- ; GP32-MM: addu16 $[[T0]], $[[T6]], $[[T0]]
- ; GP32-MM: subu16 $[[T0]], $5, $[[T0]]
- ; GP32-MM: sltu $[[T6]], $7, $[[T5]]
- ; GP32-MM: addu $[[T6]], $[[T6]], $[[T4]]
- ; GP32-MM: subu16 $[[T6]], $6, $[[T6]]
- ; GP32-MM: move $[[T2]], $[[T1]]
-
- ; GP64: dsubu $3, $5, $7
- ; GP64: sltu $[[T0:[0-9]+]], $5, $7
- ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6
- ; GP64: dsubu $2, $4, $[[T1]]
+; PRE4: lw $[[T0:[0-9]+]], 24($sp)
+; PRE4: lw $[[T1:[0-9]+]], 28($sp)
+; PRE4: sltu $[[T2:[0-9]+]], $7, $[[T1]]
+; PRE4: xor $[[T3:[0-9]+]], $6, $[[T0]]
+; PRE4: sltiu $[[T4:[0-9]+]], $[[T3]], 1
+; PRE4: bnez $[[T4]]
+; PRE4: move $[[T5:[0-9]+]], $[[T2]]
+; PRE4: sltu $[[T5]], $6, $[[T0]]
+
+; PRE4: lw $[[T6:[0-9]+]], 20($sp)
+; PRE4: subu $[[T7:[0-9]+]], $5, $[[T6]]
+; PRE4: subu $[[T8:[0-9]+]], $[[T7]], $[[T5]]
+; PRE4: sltu $[[T9:[0-9]+]], $[[T7]], $[[T5]]
+; PRE4: sltu $[[T10:[0-9]+]], $5, $[[T6]]
+; PRE4: lw $[[T11:[0-9]+]], 16($sp)
+; PRE4: subu $[[T12:[0-9]+]], $4, $[[T11]]
+; PRE4: subu $[[T13:[0-9]+]], $[[T12]], $[[T10]]
+; PRE4: subu $[[T14:[0-9]+]], $[[T13]], $[[T9]]
+; PRE4: subu $[[T15:[0-9]+]], $6, $[[T0]]
+; PRE4: subu $[[T16:[0-9]+]], $[[T15]], $[[T2]]
+; PRE4: subu $5, $7, $[[T1]]
+
+; MMR3: lw $[[T1:[0-9]+]], 48($sp)
+; MMR3: sltu $[[T2:[0-9]+]], $6, $[[T1]]
+; MMR3: xor $[[T3:[0-9]+]], $6, $[[T1]]
+; MMR3: lw $[[T4:[0-9]+]], 52($sp)
+; MMR3: sltu $[[T5:[0-9]+]], $7, $[[T4]]
+; MMR3: movz $[[T6:[0-9]+]], $[[T5]], $[[T3]]
+; MMR3: lw $[[T7:[0-9]+]], 44($sp)
+; MMR3: subu16 $[[T8:[0-9]+]], $5, $[[T7]]
+; MMR3: subu16 $[[T9:[0-9]+]], $[[T8]], $[[T6]]
+; MMR3: sltu $[[T10:[0-9]+]], $[[T8]], $[[T2]]
+; MMR3: sltu $[[T11:[0-9]+]], $5, $[[T7]]
+; MMR3: lw $[[T12:[0-9]+]], 40($sp)
+; MMR3: lw $[[T13:[0-9]+]], 12($sp)
+; MMR3: subu16 $[[T14:[0-9]+]], $[[T13]], $[[T12]]
+; MMR3: subu16 $[[T15:[0-9]+]], $[[T14]], $[[T11]]
+; MMR3: subu16 $[[T16:[0-9]+]], $[[T15]], $[[T10]]
+; MMR3: subu16 $[[T17:[0-9]+]], $6, $[[T1]]
+; MMR3: subu16 $[[T18:[0-9]+]], $[[T17]], $7
+; MMR3: lw $[[T19:[0-9]+]], 8($sp)
+; MMR3: lw $[[T20:[0-9]+]], 0($sp)
+; MMR3: subu16 $5, $[[T19]], $[[T20]]
+
+; MMR6: move $[[T0:[0-9]+]], $7
+; MMR6: sw $[[T0]], 8($sp)
+; MMR6: move $[[T1:[0-9]+]], $5
+; MMR6: sw $4, 12($sp)
+; MMR6: lw $[[T2:[0-9]+]], 48($sp)
+; MMR6: sltu $[[T3:[0-9]+]], $6, $[[T2]]
+; MMR6: xor $[[T4:[0-9]+]], $6, $[[T2]]
+; MMR6: sltiu $[[T5:[0-9]+]], $[[T4]], 1
+; MMR6: seleqz $[[T6:[0-9]+]], $[[T3]], $[[T5]]
+; MMR6: lw $[[T7:[0-9]+]], 52($sp)
+; MMR6: sltu $[[T8:[0-9]+]], $[[T0]], $[[T7]]
+; MMR6: selnez $[[T9:[0-9]+]], $[[T8]], $[[T5]]
+; MMR6: or $[[T10:[0-9]+]], $[[T9]], $[[T6]]
+; MMR6: lw $[[T11:[0-9]+]], 44($sp)
+; MMR6: subu16 $[[T12:[0-9]+]], $[[T1]], $[[T11]]
+; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T7]]
+; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T7]]
+; MMR6: sltu $[[T17:[0-9]+]], $[[T1]], $[[T11]]
+; MMR6: lw $[[T18:[0-9]+]], 40($sp)
+; MMR6: lw $[[T19:[0-9]+]], 12($sp)
+; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $[[T18]]
+; MMR6: subu16 $[[T21:[0-9]+]], $[[T20]], $[[T17]]
+; MMR6: subu16 $[[T22:[0-9]+]], $[[T21]], $[[T16]]
+; MMR6: subu16 $[[T23:[0-9]+]], $6, $[[T2]]
+; MMR6: subu16 $4, $[[T23]], $5
+; MMR6: lw $[[T24:[0-9]+]], 8($sp)
+; MMR6: lw $[[T25:[0-9]+]], 0($sp)
+; MMR6: subu16 $5, $[[T24]], $[[T25]]
+; MMR6: lw $3, 4($sp)
+
+; FIXME: The sltu, dsll, dsrl pattern here occurs when an i32 is zero
+; extended to 64 bits. Fortunately slt(i)(u) actually gives an i1.
+; These should be combined away.
+
+; GP64-NOT-R2: dsubu $1, $4, $6
+; GP64-NOT-R2: sltu $[[T0:[0-9]+]], $5, $7
+; GP64-NOT-R2: dsll $[[T1:[0-9]+]], $[[T0]], 32
+; GP64-NOT-R2: dsrl $[[T2:[0-9]+]], $[[T1]], 32
+; GP64-NOT-R2: dsubu $2, $1, $[[T2]]
+; GP64-NOT-R2: dsubu $3, $5, $7
+
+; FIXME: Likewise for the sltu, dext here.
+
+; GP64-R2: dsubu $1, $4, $6
+; GP64-R2: sltu $[[T0:[0-9]+]], $5, $7
+; GP64-R2: dext $[[T1:[0-9]+]], $[[T0]], 0, 32
+; GP64-R2: dsubu $2, $1, $[[T1]]
+; GP64-R2: dsubu $3, $5, $7
+
+; FIXME: Again, redundant sign extension. Also, microMIPSR6 has the
+; dext instruction which should be used here.
+
+; MM64: dsubu $[[T0:[0-9]+]], $4, $6
+; MM64: sltu $[[T1:[0-9]+]], $5, $7
+; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32
+; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+; MM64: dsubu $2, $[[T0]], $[[T3]]
+; MM64: dsubu $3, $5, $7
+; MM64: jr $ra
%r = sub i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 11bc6d390319d..c616089c6df02 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -1,17 +1,17 @@
-; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -march=mipsel -force-mips-long-branch -O3 -relocation-model=pic < %s \
+; RUN: llc -march=mipsel -relocation-model=pic < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=mipsel -force-mips-long-branch -O3 -relocation-model=pic < %s -verify-machineinstrs \
; RUN: | FileCheck %s -check-prefix=O32
; RUN: llc -march=mipsel -mcpu=mips32r6 -force-mips-long-branch -O3 \
-; RUN: -relocation-model=pic -asm-show-inst < %s | FileCheck %s -check-prefix=O32-R6
+; RUN: -relocation-model=pic -asm-show-inst < %s -verify-machineinstrs | FileCheck %s -check-prefix=O32-R6
; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 -force-mips-long-branch -O3 -relocation-model=pic \
-; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: < %s -verify-machineinstrs | FileCheck %s -check-prefix=N64
; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -force-mips-long-branch -O3 -relocation-model=pic \
-; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: < %s -verify-machineinstrs | FileCheck %s -check-prefix=N64
; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 -force-mips-long-branch -O3 \
-; RUN: -relocation-model=pic -asm-show-inst < %s | FileCheck %s -check-prefix=N64-R6
+; RUN: -relocation-model=pic -asm-show-inst < %s -verify-machineinstrs | FileCheck %s -check-prefix=N64-R6
; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=micromips \
-; RUN: -force-mips-long-branch -O3 -relocation-model=pic < %s | FileCheck %s -check-prefix=MICROMIPS
-; RUN: llc -mtriple=mipsel-none-nacl -force-mips-long-branch -O3 -relocation-model=pic < %s \
+; RUN: -force-mips-long-branch -O3 -relocation-model=pic < %s -verify-machineinstrs | FileCheck %s -check-prefix=MICROMIPS
+; RUN: llc -mtriple=mipsel-none-nacl -force-mips-long-branch -O3 -relocation-model=pic < %s -verify-machineinstrs \
; RUN: | FileCheck %s -check-prefix=NACL
@@ -59,9 +59,9 @@ end:
; Check for long branch expansion:
; O32: addiu $sp, $sp, -8
; O32-NEXT: sw $ra, 0($sp)
-; O32-NEXT: lui $1, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; O32-NEXT: lui $1, %hi(($BB0_[[BB2:[0-9]+]])-($[[BB1:BB[0-9_]+]]))
; O32-NEXT: bal $[[BB1]]
-; O32-NEXT: addiu $1, $1, %lo(($[[BB2]])-($[[BB1]]))
+; O32-NEXT: addiu $1, $1, %lo(($BB0_[[BB2]])-($[[BB1]]))
; O32-NEXT: $[[BB1]]:
; O32-NEXT: addu $1, $ra, $1
; O32-NEXT: lw $ra, 0($sp)
@@ -72,7 +72,7 @@ end:
; O32: lw $[[R1:[0-9]+]], %got(x)($[[GP]])
; O32: addiu $[[R2:[0-9]+]], $zero, 1
; O32: sw $[[R2]], 0($[[R1]])
-; O32: $[[BB2]]:
+; O32: # BB#[[BB2]]:
; O32: jr $ra
; O32: nop
@@ -90,10 +90,10 @@ end:
; Check for long branch expansion:
; N64: daddiu $sp, $sp, -16
; N64-NEXT: sd $ra, 0($sp)
-; N64-NEXT: daddiu $1, $zero, %hi([[BB2:\.LBB[0-9_]+]]-[[BB1:\.LBB[0-9_]+]])
+; N64-NEXT: daddiu $1, $zero, %hi(.LBB0_[[BB2:[0-9_]+]]-[[BB1:\.LBB[0-9_]+]])
; N64-NEXT: dsll $1, $1, 16
; N64-NEXT: bal [[BB1]]
-; N64-NEXT: daddiu $1, $1, %lo([[BB2]]-[[BB1]])
+; N64-NEXT: daddiu $1, $1, %lo(.LBB0_[[BB2]]-[[BB1]])
; N64-NEXT: [[BB1]]:
; N64-NEXT: daddu $1, $ra, $1
; N64-NEXT: ld $ra, 0($sp)
@@ -105,7 +105,7 @@ end:
; N64: addiu $[[R3:[0-9]+]], $zero, 1
; N64: ld $[[R2:[0-9]+]], %got_disp(x)($[[GP]])
; N64: sw $[[R3]], 0($[[R2]])
-; N64: [[BB2]]:
+; N64: # BB#[[BB2]]:
; N64: jr $ra
; N64: nop
@@ -125,9 +125,9 @@ end:
; Check for long branch expansion:
; MICROMIPS: addiu $sp, $sp, -8
; MICROMIPS-NEXT: sw $ra, 0($sp)
-; MICROMIPS-NEXT: lui $1, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; MICROMIPS-NEXT: lui $1, %hi(($BB0_[[BB2:[0-9]+]])-($[[BB1:BB[0-9_]+]]))
; MICROMIPS-NEXT: bal $[[BB1]]
-; MICROMIPS-NEXT: addiu $1, $1, %lo(($[[BB2]])-($[[BB1]]))
+; MICROMIPS-NEXT: addiu $1, $1, %lo(($BB0_[[BB2]])-($[[BB1]]))
; MICROMIPS-NEXT: $[[BB1]]:
; MICROMIPS-NEXT: addu $1, $ra, $1
; MICROMIPS-NEXT: lw $ra, 0($sp)
@@ -138,7 +138,7 @@ end:
; MICROMIPS: lw $[[R1:[0-9]+]], %got(x)($[[GP]])
; MICROMIPS: li16 $[[R2:[0-9]+]], 1
; MICROMIPS: sw16 $[[R2]], 0($[[R1]])
-; MICROMIPS: $[[BB2]]:
+; MICROMIPS: # BB#[[BB2]]:
; MICROMIPS: jrc $ra
@@ -154,9 +154,9 @@ end:
; Check for long branch expansion:
; NACL: addiu $sp, $sp, -8
; NACL-NEXT: sw $ra, 0($sp)
-; NACL-NEXT: lui $1, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; NACL-NEXT: lui $1, %hi(($BB0_[[BB2:[0-9]+]])-($[[BB1:BB[0-9_]+]]))
; NACL-NEXT: bal $[[BB1]]
-; NACL-NEXT: addiu $1, $1, %lo(($[[BB2]])-($[[BB1]]))
+; NACL-NEXT: addiu $1, $1, %lo(($BB0_[[BB2]])-($[[BB1]]))
; NACL-NEXT: $[[BB1]]:
; NACL-NEXT: addu $1, $ra, $1
; NACL-NEXT: lw $ra, 0($sp)
@@ -169,7 +169,7 @@ end:
; NACL: addiu $[[R2:[0-9]+]], $zero, 1
; NACL: sw $[[R2]], 0($[[R1]])
; NACL: .p2align 4
-; NACL-NEXT: $[[BB2]]:
+; NACL-NEXT: # BB#[[BB2]]:
; NACL: jr $ra
; NACL: nop
}
diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll
index 7baba005a0729..3e1a2e8b97088 100644
--- a/test/CodeGen/Mips/madd-msub.ll
+++ b/test/CodeGen/Mips/madd-msub.ll
@@ -25,11 +25,11 @@
; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6
-; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6
-; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31
-; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]]
-; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}}
-; 32R6-DAG: addu $2, $[[T5]], $[[T4]]
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: sra $[[T4:[0-9]+]], $6, 31
+; 32R6-DAG: addu $[[T5:[0-9]+]], $[[T3]], $[[T4]]
+; 32R6-DAG: addu $2, $[[T5]], $[[T2]]
; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
@@ -71,7 +71,7 @@ entry:
; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6
-; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
; FIXME: There's a redundant move here. We should remove it
; 32R6-DAG: muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}}
; 32R6-DAG: addu $2, $[[T3]], $[[T2]]
@@ -109,10 +109,10 @@ entry:
; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $7
-; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $7
-; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $6
-; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}}
-; 32R6-DAG: addu $2, $[[T5]], $[[T4]]
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $1
+; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $6
+; 32R6-DAG: addu $2, $[[T4]], $[[T2]]
; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
@@ -134,6 +134,17 @@ entry:
ret i64 %add
}
+; ALL-LABEL: madd4
+; ALL-NOT: madd ${{[0-9]+}}, ${{[0-9]+}}
+
+define i32 @madd4(i32 %a, i32 %b, i32 %c) {
+entry:
+ %mul = mul nsw i32 %a, %b
+ %add = add nsw i32 %c, %mul
+
+ ret i32 %add
+}
+
; ALL-LABEL: msub1:
; 32-DAG: sra $[[T0:[0-9]+]], $6, 31
@@ -148,13 +159,13 @@ entry:
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
-; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
-; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}}
-; 32R6-DAG: sltu $[[T3:[0-9]+]], $6, $[[T1]]
-; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $[[T0]]
-; 32R6-DAG: sra $[[T5:[0-9]+]], $6, 31
-; 32R6-DAG: subu $2, $[[T5]], $[[T4]]
-; 32R6-DAG: subu $3, $6, $[[T1]]
+; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]]
+; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31
+; 32R6-DAG: subu $[[T4:[0-9]+]], $[[T3]], $[[T2]]
+; 32R6-DAG: subu $2, $[[T4]], $[[T1]]
+; 32R6-DAG: subu $3, $6, $[[T0]]
; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
@@ -194,13 +205,12 @@ entry:
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
-; 32R6-DAG: muhu $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
-; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}}
-
-; 32R6-DAG: sltu $[[T2:[0-9]+]], $6, $[[T1]]
-; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]]
-; 32R6-DAG: negu $2, $[[T3]]
-; 32R6-DAG: subu $3, $6, $[[T1]]
+; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]]
+; 32R6-DAG: muhu $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: negu $[[T3:[0-9]+]], $[[T2]]
+; 32R6-DAG: subu $2, $[[T3]], $[[T1]]
+; 32R6-DAG: subu $3, $6, $[[T0]]
; 64-DAG: d[[m:m]]ult $5, $4
; 64-DAG: [[m]]flo $[[T0:[0-9]+]]
@@ -234,12 +244,12 @@ entry:
; DSP-DAG: mfhi $2, $[[AC]]
; DSP-DAG: mflo $3, $[[AC]]
-; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
-; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}}
-; 32R6-DAG: sltu $[[T2:[0-9]+]], $7, $[[T1]]
-; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]]
-; 32R6-DAG: subu $2, $6, $[[T3]]
-; 32R6-DAG: subu $3, $7, $[[T1]]
+; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: sltu $[[T1:[0-9]+]], $7, $[[T0]]
+; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: subu $[[T3:[0-9]+]], $6, $[[T2]]
+; 32R6-DAG: subu $2, $[[T3]], $[[T1]]
+; 32R6-DAG: subu $3, $7, $[[T0]]
; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
@@ -260,3 +270,14 @@ entry:
%sub = sub nsw i64 %c, %mul
ret i64 %sub
}
+
+; ALL-LABEL: msub4
+; ALL-NOT: msub ${{[0-9]+}}, ${{[0-9]+}}
+
+define i32 @msub4(i32 %a, i32 %b, i32 %c) {
+entry:
+ %mul = mul nsw i32 %a, %b
+ %sub = sub nsw i32 %c, %mul
+
+ ret i32 %sub
+}
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 2039c1f57f17e..f402cb78bd189 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -109,7 +109,7 @@ entry:
%tmp = load atomic i64, i64* %mem acquire, align 64
; CHECK-NOT: ldarx
; CHECK: ld [[VAL:r[0-9]+]]
-; CHECK: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK: cmpd [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
; CHECK: bne- [[CR]], .+4
; CHECK: isync
ret i64 %tmp
diff --git a/test/CodeGen/PowerPC/atomics-constant.ll b/test/CodeGen/PowerPC/atomics-constant.ll
index a92ca813af857..77825c608a3bb 100644
--- a/test/CodeGen/PowerPC/atomics-constant.ll
+++ b/test/CodeGen/PowerPC/atomics-constant.ll
@@ -11,7 +11,7 @@ define i64 @foo() {
; CHECK-NEXT: addis 3, 2, .LC0@toc@ha
; CHECK-NEXT: li 4, 0
; CHECK-NEXT: ld 3, .LC0@toc@l(3)
-; CHECK-NEXT: cmpw 7, 4, 4
+; CHECK-NEXT: cmpd 7, 4, 4
; CHECK-NEXT: ld 3, 0(3)
; CHECK-NEXT: bne- 7, .+4
; CHECK-NEXT: isync
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
index 054d3a4146b03..d57b3a203791c 100644
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -23,7 +23,7 @@ define i8 @test2(i8* %ptr) {
; PPC64LE-LABEL: test2:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lbz 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -37,7 +37,7 @@ define i8 @test3(i8* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lbz 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -67,7 +67,7 @@ define i16 @test6(i16* %ptr) {
; PPC64LE-LABEL: test6:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lhz 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -81,7 +81,7 @@ define i16 @test7(i16* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lhz 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -111,7 +111,7 @@ define i32 @test10(i32* %ptr) {
; PPC64LE-LABEL: test10:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwz 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -125,7 +125,7 @@ define i32 @test11(i32* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lwz 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -155,7 +155,7 @@ define i64 @test14(i64* %ptr) {
; PPC64LE-LABEL: test14:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: ld 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -169,7 +169,7 @@ define i64 @test15(i64* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: ld 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
@@ -9566,7 +9566,7 @@ define i32 @test_ordering0(i32* %ptr1, i32* %ptr2) {
; PPC64LE-LABEL: test_ordering0:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwz 4, 0(3)
-; PPC64LE-NEXT: cmpw 7, 4, 4
+; PPC64LE-NEXT: cmpd 7, 4, 4
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: lwz 3, 0(3)
@@ -9583,7 +9583,7 @@ define i32 @test_ordering1(i32* %ptr1, i32 %val1, i32* %ptr2) {
; PPC64LE-LABEL: test_ordering1:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwz 3, 0(3)
-; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: cmpd 7, 3, 3
; PPC64LE-NEXT: bne- 7, .+4
; PPC64LE-NEXT: isync
; PPC64LE-NEXT: stw 4, 0(5)
diff --git a/test/CodeGen/PowerPC/licm-tocReg.ll b/test/CodeGen/PowerPC/licm-tocReg.ll
new file mode 100644
index 0000000000000..ecdfcba6e3b7a
--- /dev/null
+++ b/test/CodeGen/PowerPC/licm-tocReg.ll
@@ -0,0 +1,110 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+; The instructions ADDIStocHA/LDtocL are used to calculate the address of
+; globals. The ones in bb.3.if.end could not be hoisted by Machine LICM
+; because of the BCTRL_LDinto_toc in bb.2.if.then. That call makes the
+; compiler save the TOC to the stack before the call and reload it into X2
+; afterwards to restore the TOC. By communicating to Machine LICM that X2 is
+; guaranteed to have the same value before and after BCTRL_LDinto_toc, these
+; instructions can be hoisted out of bb.3.if.end to outside of the loop.
+
+; Pre Machine LICM MIR
+;
+;body:
+; bb.0.entry:
+; successors: %bb.2.if.then(0x40000000), %bb.3.if.end(0x40000000)
+; liveins: %x3
+;
+; %4 = COPY %x3
+; %5 = ADDIStocHA %x2, @ga
+; %6 = LDtocL @ga, killed %5 :: (load 8 from got)
+; %7 = LWZ 0, %6 :: (volatile dereferenceable load 4 from @ga)
+; %8 = ADDIStocHA %x2, @gb
+; %9 = LDtocL @gb, killed %8 :: (load 8 from got)
+; %10 = LWZ 0, killed %9 :: (volatile dereferenceable load 4 from @gb)
+; %0 = LWZ 0, %6 :: (volatile dereferenceable load 4 from @ga)
+; %11 = CMPW killed %7, killed %10
+; BCC 44, killed %11, %bb.2.if.then
+; B %bb.3.if.end
+;
+; bb.2.if.then:
+; %1 = PHI %0, %bb.0.entry, %3, %bb.3.if.end
+; ADJCALLSTACKDOWN 32, 0, implicit-def dead %r1, implicit %r1
+; %20 = COPY %x2
+; STD %20, 24, %x1 :: (store 8 into stack + 24)
+; %21 = EXTSW_32_64 %1
+; %x3 = COPY %21
+; %x12 = COPY %4
+; MTCTR8 %4, implicit-def %ctr8
+; BCTRL8_LDinto_toc 24, %x1, csr_svr464_altivec, implicit-def dead %lr8, implicit-def dead %x2, implicit %ctr8, implicit %rm, implicit %x3, implicit %x12, implicit %x2, implicit-def %r1, implicit-def %x3
+; ADJCALLSTACKUP 32, 0, implicit-def dead %r1, implicit %r1
+; %22 = COPY %x3
+; %x3 = COPY %22
+; BLR8 implicit %lr8, implicit %rm, implicit %x3
+;
+; bb.3.if.end:
+; successors: %bb.2.if.then(0x04000000), %bb.3.if.end(0x7c000000)
+;
+; %2 = PHI %0, %bb.0.entry, %3, %bb.3.if.end
+; %12 = ADDI %2, 1
+; %13 = ADDIStocHA %x2, @ga
+; %14 = LDtocL @ga, killed %13 :: (load 8 from got)
+; STW killed %12, 0, %14 :: (volatile store 4 into @ga)
+; %15 = LWZ 0, %14 :: (volatile dereferenceable load 4 from @ga)
+; %16 = ADDIStocHA %x2, @gb
+; %17 = LDtocL @gb, killed %16 :: (load 8 from got)
+; %18 = LWZ 0, killed %17 :: (volatile dereferenceable load 4 from @gb)
+; %3 = LWZ 0, %14 :: (volatile dereferenceable load 4 from @ga)
+; %19 = CMPW killed %15, killed %18
+; BCC 44, killed %19, %bb.2.if.then
+; B %bb.3.if.end
+
+@ga = external global i32, align 4
+@gb = external global i32, align 4
+
+; Function Attrs: nounwind
+define signext i32 @test(i32 (i32)* nocapture %FP) local_unnamed_addr #0 {
+; CHECK-LABEL: test:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis 4, 2, .LC0@toc@ha
+; CHECK-NEXT: addis 5, 2, .LC1@toc@ha
+; CHECK-NEXT: ld 4, .LC0@toc@l(4)
+; CHECK-NEXT: ld 5, .LC1@toc@l(5)
+; CHECK-NEXT: lwz 6, 0(4)
+; CHECK-NEXT: lwz 5, 0(5)
+; CHECK-NEXT: cmpw 6, 5
+; CHECK-NEXT: lwz 5, 0(4)
+; CHECK-NEXT: mr 4, 3
+; CHECK-NEXT: bgt 0, .LBB0_3
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: addis 3, 2, .LC0@toc@ha
+; CHECK-NEXT: addis 6, 2, .LC1@toc@ha
+; CHECK-NEXT: ld 3, .LC0@toc@l(3)
+; CHECK-NEXT: ld 6, .LC1@toc@l(6)
+; CHECK-NEXT: .p2align 5
+; CHECK-NEXT: .LBB0_2: # %if.end
+; CHECK-NOT: addis {{[0-9]+}}, 2, .LC0@toc@ha
+; CHECK-NOT: addis {{[0-9]+}}, 2, .LC1@toc@ha
+; CHECK: blr
+entry:
+ %0 = load volatile i32, i32* @ga, align 4
+ %1 = load volatile i32, i32* @gb, align 4
+ %cmp1 = icmp sgt i32 %0, %1
+ %2 = load volatile i32, i32* @ga, align 4
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %if.end, %entry
+ %.lcssa = phi i32 [ %2, %entry ], [ %6, %if.end ]
+ %call = tail call signext i32 %FP(i32 signext %.lcssa) #1
+ ret i32 %call
+
+if.end: ; preds = %entry, %if.end
+ %3 = phi i32 [ %6, %if.end ], [ %2, %entry ]
+ %inc = add nsw i32 %3, 1
+ store volatile i32 %inc, i32* @ga, align 4
+ %4 = load volatile i32, i32* @ga, align 4
+ %5 = load volatile i32, i32* @gb, align 4
+ %cmp = icmp sgt i32 %4, %5
+ %6 = load volatile i32, i32* @ga, align 4
+ br i1 %cmp, label %if.then, label %if.end
+}
diff --git a/test/CodeGen/PowerPC/logic-ops-on-compares.ll b/test/CodeGen/PowerPC/logic-ops-on-compares.ll
index 5a507e9ff678c..df021c20ea86e 100644
--- a/test/CodeGen/PowerPC/logic-ops-on-compares.ll
+++ b/test/CodeGen/PowerPC/logic-ops-on-compares.ll
@@ -40,8 +40,8 @@ return: ; preds = %if.end, %if.then
ret i32 %retval.0
}
-define void @neg_truncate_i32_eq(i32 *%ptr) {
-; CHECK-LABEL: neg_truncate_i32_eq:
+define void @neg_truncate_i32(i32 *%ptr) {
+; CHECK-LABEL: neg_truncate_i32:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: lwz r3, 0(r3)
; CHECK-NEXT: rldicl. r3, r3, 0, 63
@@ -66,8 +66,8 @@ if.end29: ; preds = %if.else
}
; Function Attrs: nounwind
-define i64 @logic_eq_64(i64 %a, i64 %b, i64 %c) {
-; CHECK-LABEL: logic_eq_64:
+define i64 @logic_ne_64(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: logic_ne_64:
; CHECK: xor r7, r3, r4
; CHECK-NEXT: li r6, 55
; CHECK-NEXT: xor r5, r5, r6
@@ -99,8 +99,8 @@ return: ; preds = %if.end, %if.then
ret i64 %retval.0
}
-define void @neg_truncate_i64_eq(i64 *%ptr) {
-; CHECK-LABEL: neg_truncate_i64_eq:
+define void @neg_truncate_i64(i64 *%ptr) {
+; CHECK-LABEL: neg_truncate_i64:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: ld r3, 0(r3)
; CHECK-NEXT: rldicl. r3, r3, 0, 63
@@ -124,67 +124,6 @@ if.end29: ; preds = %if.else
}
-; Function Attrs: nounwind
-define i64 @logic_ne_64(i64 %a, i64 %b, i64 %c) {
-; CHECK-LABEL: logic_ne_64:
-; CHECK: xor r7, r3, r4
-; CHECK-NEXT: li r6, 55
-; CHECK-NEXT: addic r8, r7, -1
-; CHECK-NEXT: xor r5, r5, r6
-; CHECK-NEXT: subfe r7, r8, r7
-; CHECK-NEXT: cntlzd r5, r5
-; CHECK-NEXT: addic r12, r4, -1
-; CHECK-NEXT: rldicl r5, r5, 58, 63
-; CHECK-NEXT: subfe r6, r12, r4
-; CHECK-NEXT: and r6, r7, r6
-; CHECK-NEXT: or. r5, r6, r5
-; CHECK-NEXT: bc 4, 1
-entry:
- %tobool = icmp ne i64 %a, %b
- %tobool1 = icmp ne i64 %b, 0
- %or.cond = and i1 %tobool, %tobool1
- %tobool3 = icmp eq i64 %c, 55
- %or.cond5 = or i1 %or.cond, %tobool3
- br i1 %or.cond5, label %if.end, label %if.then
-
-if.then: ; preds = %entry
- %call = tail call i64 @foo64(i64 %a) #2
- br label %return
-
-if.end: ; preds = %entry
- %call4 = tail call i64 @bar64(i64 %b) #2
- br label %return
-
-return: ; preds = %if.end, %if.then
- %retval.0 = phi i64 [ %call4, %if.end ], [ %call, %if.then ]
- ret i64 %retval.0
-}
-
-define void @neg_truncate_i64_ne(i64 *%ptr) {
-; CHECK-LABEL: neg_truncate_i64_ne:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: ld r3, 0(r3)
-; CHECK-NEXT: andi. r3, r3, 1
-; CHECK-NEXT: bclr 12, 1, 0
-; CHECK-NEXT: # BB#1: # %if.end29.thread136
-; CHECK-NEXT: .LBB5_2: # %if.end29
-entry:
- %0 = load i64, i64* %ptr, align 4
- %rem17127 = and i64 %0, 1
- %cmp18 = icmp ne i64 %rem17127, 0
- br label %if.else
-
-if.else: ; preds = %entry
- br i1 %cmp18, label %if.end29, label %if.end29.thread136
-
-if.end29.thread136: ; preds = %if.else
- unreachable
-
-if.end29: ; preds = %if.else
- ret void
-
-}
-
declare signext i32 @foo(i32 signext)
declare signext i32 @bar(i32 signext)
declare i64 @foo64(i64)
diff --git a/test/CodeGen/PowerPC/ppc64-P9-mod.ll b/test/CodeGen/PowerPC/ppc64-P9-mod.ll
new file mode 100644
index 0000000000000..46e347becbb67
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-P9-mod.ll
@@ -0,0 +1,263 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-PWR8 -implicit-check-not mod[us][wd]
+
+@mod_resultsw = common local_unnamed_addr global i32 0, align 4
+@mod_resultud = common local_unnamed_addr global i64 0, align 8
+@div_resultsw = common local_unnamed_addr global i32 0, align 4
+@mod_resultuw = common local_unnamed_addr global i32 0, align 4
+@div_resultuw = common local_unnamed_addr global i32 0, align 4
+@div_resultsd = common local_unnamed_addr global i64 0, align 8
+@mod_resultsd = common local_unnamed_addr global i64 0, align 8
+@div_resultud = common local_unnamed_addr global i64 0, align 8
+
+; Function Attrs: norecurse nounwind
+define void @modulo_sw(i32 signext %a, i32 signext %b) local_unnamed_addr {
+entry:
+ %rem = srem i32 %a, %b
+ store i32 %rem, i32* @mod_resultsw, align 4
+ ret void
+; CHECK-LABEL: modulo_sw
+; CHECK: modsw {{[0-9]+}}, 3, 4
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_sw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i32 @modulo_uw(i32 zeroext %a, i32 zeroext %b) local_unnamed_addr {
+entry:
+ %rem = urem i32 %a, %b
+ ret i32 %rem
+; CHECK-LABEL: modulo_uw
+; CHECK: moduw {{[0-9]+}}, 3, 4
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_uw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @modulo_sd(i64 %a, i64 %b) local_unnamed_addr {
+entry:
+ %rem = srem i64 %a, %b
+ ret i64 %rem
+; CHECK-LABEL: modulo_sd
+; CHECK: modsd {{[0-9]+}}, 3, 4
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_sd
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+define void @modulo_ud(i64 %a, i64 %b) local_unnamed_addr {
+entry:
+ %rem = urem i64 %a, %b
+ store i64 %rem, i64* @mod_resultud, align 8
+ ret void
+; CHECK-LABEL: modulo_ud
+; CHECK: modud {{[0-9]+}}, 3, 4
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_ud
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+define void @modulo_div_sw(i32 signext %a, i32 signext %b) local_unnamed_addr {
+entry:
+ %rem = srem i32 %a, %b
+ store i32 %rem, i32* @mod_resultsw, align 4
+ %div = sdiv i32 %a, %b
+ store i32 %div, i32* @div_resultsw, align 4
+ ret void
+; CHECK-LABEL: modulo_div_sw
+; CHECK-NOT: modsw
+; CHECK: div
+; CHECK-NOT: modsw
+; CHECK: mull
+; CHECK-NOT: modsw
+; CHECK: sub
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_div_sw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+define void @modulo_div_abc_sw(i32 signext %a, i32 signext %b, i32 signext %c) local_unnamed_addr {
+entry:
+ %rem = srem i32 %a, %c
+ store i32 %rem, i32* @mod_resultsw, align 4
+ %div = sdiv i32 %b, %c
+ store i32 %div, i32* @div_resultsw, align 4
+ ret void
+; CHECK-LABEL: modulo_div_abc_sw
+; CHECK: modsw {{[0-9]+}}, 3, 5
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_div_abc_sw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+define void @modulo_div_uw(i32 zeroext %a, i32 zeroext %b) local_unnamed_addr {
+entry:
+ %rem = urem i32 %a, %b
+ store i32 %rem, i32* @mod_resultuw, align 4
+ %div = udiv i32 %a, %b
+ store i32 %div, i32* @div_resultuw, align 4
+ ret void
+; CHECK-LABEL: modulo_div_uw
+; CHECK-NOT: modsw
+; CHECK: div
+; CHECK-NOT: modsw
+; CHECK: mull
+; CHECK-NOT: modsw
+; CHECK: sub
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_div_uw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+define void @modulo_div_swuw(i32 signext %a, i32 signext %b) local_unnamed_addr {
+entry:
+ %rem = srem i32 %a, %b
+ store i32 %rem, i32* @mod_resultsw, align 4
+ %div = udiv i32 %a, %b
+ store i32 %div, i32* @div_resultsw, align 4
+ ret void
+; CHECK-LABEL: modulo_div_swuw
+; CHECK: modsw {{[0-9]+}}, 3, 4
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_div_swuw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+define void @modulo_div_udsd(i64 %a, i64 %b) local_unnamed_addr {
+entry:
+ %rem = urem i64 %a, %b
+ store i64 %rem, i64* @mod_resultud, align 8
+ %div = sdiv i64 %a, %b
+ store i64 %div, i64* @div_resultsd, align 8
+ ret void
+; CHECK-LABEL: modulo_div_udsd
+; CHECK: modud {{[0-9]+}}, 3, 4
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_div_udsd
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+define void @modulo_const32_sw(i32 signext %a) local_unnamed_addr {
+entry:
+ %rem = srem i32 %a, 32
+ store i32 %rem, i32* @mod_resultsw, align 4
+ ret void
+; CHECK-LABEL: modulo_const32_sw
+; CHECK-NOT: modsw
+; CHECK: srawi
+; CHECK-NOT: modsw
+; CHECK: addze
+; CHECK-NOT: modsw
+; CHECK: slwi
+; CHECK-NOT: modsw
+; CHECK: subf
+; CHECK-NOT: modsw
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_const32_sw
+; CHECK-PWR8: srawi
+; CHECK-PWR8: addze
+; CHECK-PWR8: slwi
+; CHECK-PWR8: subf
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @modulo_const3_sw(i32 signext %a) local_unnamed_addr {
+entry:
+ %rem = srem i32 %a, 3
+ ret i32 %rem
+; CHECK-LABEL: modulo_const3_sw
+; CHECK-NOT: modsw
+; CHECK: mull
+; CHECK-NOT: modsw
+; CHECK: sub
+; CHECK-NOT: modsw
+; CHECK: blr
+; CHECK-PWR8-LABEL: modulo_const3_sw
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @const2_modulo_sw(i32 signext %a) local_unnamed_addr {
+entry:
+ %rem = srem i32 2, %a
+ ret i32 %rem
+; CHECK-LABEL: const2_modulo_sw
+; CHECK: modsw {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK: blr
+; CHECK-PWR8-LABEL: const2_modulo_sw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+; Function Attrs: norecurse nounwind
+; FIXME: On Power 9, this test will still produce modsw because the divide is
+; in a different block than the remainder. Due to the nature of the SDAG, we
+; cannot see the div in the other block.
+define void @blocks_modulo_div_sw(i32 signext %a, i32 signext %b, i32 signext %c) local_unnamed_addr {
+entry:
+ %div = sdiv i32 %a, %b
+ store i32 %div, i32* @div_resultsw, align 4
+ %cmp = icmp sgt i32 %c, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %rem = srem i32 %a, %b
+ store i32 %rem, i32* @mod_resultsw, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+; CHECK-LABEL: blocks_modulo_div_sw
+; CHECK: div
+; CHECK: modsw {{[0-9]+}}, 3, 4
+; CHECK: blr
+; CHECK-PWR8-LABEL: blocks_modulo_div_sw
+; CHECK-PWR8: div
+; CHECK-PWR8: mull
+; CHECK-PWR8: sub
+; CHECK-PWR8: blr
+}
+
+
diff --git a/test/CodeGen/PowerPC/testComparesinesll.ll b/test/CodeGen/PowerPC/testComparesinesll.ll
deleted file mode 100644
index 9e93694558579..0000000000000
--- a/test/CodeGen/PowerPC/testComparesinesll.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-
-@glob = common local_unnamed_addr global i64 0, align 8
-
-define signext i32 @test_inesll(i64 %a, i64 %b) {
-; CHECK-LABEL: test_inesll:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv = zext i1 %cmp to i32
- ret i32 %conv
-}
-
-define signext i32 @test_inesll_sext(i64 %a, i64 %b) {
-; CHECK-LABEL: test_inesll_sext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %sub = sext i1 %cmp to i32
- ret i32 %sub
-}
-
-define signext i32 @test_inesll_z(i64 %a) {
-; CHECK-LABEL: test_inesll_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv = zext i1 %cmp to i32
- ret i32 %conv
-}
-
-define signext i32 @test_inesll_sext_z(i64 %a) {
-; CHECK-LABEL: test_inesll_sext_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %sub = sext i1 %cmp to i32
- ret i32 %sub
-}
-
-define void @test_inesll_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_inesll_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_inesll_sext_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_inesll_sext_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_inesll_z_store(i64 %a) {
-; CHECK-LABEL: test_inesll_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_inesll_sext_z_store(i64 %a) {
-; CHECK-LABEL: test_inesll_sext_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
diff --git a/test/CodeGen/PowerPC/testComparesineull.ll b/test/CodeGen/PowerPC/testComparesineull.ll
deleted file mode 100644
index 7f0fed15157c5..0000000000000
--- a/test/CodeGen/PowerPC/testComparesineull.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-
-@glob = common local_unnamed_addr global i64 0, align 8
-
-define signext i32 @test_ineull(i64 %a, i64 %b) {
-; CHECK-LABEL: test_ineull:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv = zext i1 %cmp to i32
- ret i32 %conv
-}
-
-define signext i32 @test_ineull_sext(i64 %a, i64 %b) {
-; CHECK-LABEL: test_ineull_sext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %sub = sext i1 %cmp to i32
- ret i32 %sub
-}
-
-define signext i32 @test_ineull_z(i64 %a) {
-; CHECK-LABEL: test_ineull_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv = zext i1 %cmp to i32
- ret i32 %conv
-}
-
-define signext i32 @test_ineull_sext_z(i64 %a) {
-; CHECK-LABEL: test_ineull_sext_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %sub = sext i1 %cmp to i32
- ret i32 %sub
-}
-
-define void @test_ineull_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_ineull_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_ineull_sext_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_ineull_sext_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_ineull_z_store(i64 %a) {
-; CHECK-LABEL: test_ineull_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_ineull_sext_z_store(i64 %a) {
-; CHECK-LABEL: test_ineull_sext_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
diff --git a/test/CodeGen/PowerPC/testComparesllnesll.ll b/test/CodeGen/PowerPC/testComparesllnesll.ll
deleted file mode 100644
index d87ff55739fc8..0000000000000
--- a/test/CodeGen/PowerPC/testComparesllnesll.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-
-@glob = common local_unnamed_addr global i64 0, align 8
-
-define i64 @test_llnesll(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llnesll:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = zext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define i64 @test_llnesll_sext(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llnesll_sext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = sext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define i64 @test_llnesll_z(i64 %a) {
-; CHECK-LABEL: test_llnesll_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = zext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define i64 @test_llnesll_sext_z(i64 %a) {
-; CHECK-LABEL: test_llnesll_sext_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = sext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define void @test_llnesll_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llnesll_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_llnesll_sext_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llnesll_sext_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_llnesll_z_store(i64 %a) {
-; CHECK-LABEL: test_llnesll_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_llnesll_sext_z_store(i64 %a) {
-; CHECK-LABEL: test_llnesll_sext_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
diff --git a/test/CodeGen/PowerPC/testComparesllneull.ll b/test/CodeGen/PowerPC/testComparesllneull.ll
deleted file mode 100644
index 7309d5899068b..0000000000000
--- a/test/CodeGen/PowerPC/testComparesllneull.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
-; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
-; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-
-@glob = common local_unnamed_addr global i64 0, align 8
-
-define i64 @test_llneull(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llneull:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = zext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define i64 @test_llneull_sext(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llneull_sext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = sext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define i64 @test_llneull_z(i64 %a) {
-; CHECK-LABEL: test_llneull_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addic r4, r3, -1
-; CHECK-NEXT: subfe r3, r4, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = zext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define i64 @test_llneull_sext_z(i64 %a) {
-; CHECK-LABEL: test_llneull_sext_z:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = sext i1 %cmp to i64
- ret i64 %conv1
-}
-
-define void @test_llneull_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llneull_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_llneull_sext_store(i64 %a, i64 %b) {
-; CHECK-LABEL: test_llneull_sext_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
-; CHECK-NEXT: xor r3, r3, r4
-; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r12)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, %b
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_llneull_z_store(i64 %a) {
-; CHECK-LABEL: test_llneull_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: addic r5, r3, -1
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r5, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = zext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
-
-define void @test_llneull_sext_z_store(i64 %a) {
-; CHECK-LABEL: test_llneull_sext_z_store:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
-; CHECK-NEXT: subfic r3, r3, 0
-; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
-; CHECK-NEXT: subfe r3, r3, r3
-; CHECK-NEXT: std r3, 0(r4)
-; CHECK-NEXT: blr
-entry:
- %cmp = icmp ne i64 %a, 0
- %conv1 = sext i1 %cmp to i64
- store i64 %conv1, i64* @glob, align 8
- ret void
-}
diff --git a/test/CodeGen/PowerPC/vec_revb.ll b/test/CodeGen/PowerPC/vec_revb.ll
new file mode 100644
index 0000000000000..c09164bae13eb
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_revb.ll
@@ -0,0 +1,54 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
+
+define <8 x i16> @testXXBRH(<8 x i16> %a) {
+; CHECK-LABEL: testXXBRH:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xxbrh 34, 34
+; CHECK-NEXT: blr
+
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ %2 = bitcast <16 x i8> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @testXXBRW(<4 x i32> %a) {
+; CHECK-LABEL: testXXBRW:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xxbrw 34, 34
+; CHECK-NEXT: blr
+
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <2 x double> @testXXBRD(<2 x double> %a) {
+; CHECK-LABEL: testXXBRD:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xxbrd 34, 34
+; CHECK-NEXT: blr
+
+entry:
+ %0 = bitcast <2 x double> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+ %2 = bitcast <16 x i8> %1 to <2 x double>
+ ret <2 x double> %2
+}
+
+define <1 x i128> @testXXBRQ(<1 x i128> %a) {
+; CHECK-LABEL: testXXBRQ:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xxbrq 34, 34
+; CHECK-NEXT: blr
+
+entry:
+ %0 = bitcast <1 x i128> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %2 = bitcast <16 x i8> %1 to <1 x i128>
+ ret <1 x i128> %2
+}
diff --git a/test/CodeGen/SystemZ/fp-sincos-01.ll b/test/CodeGen/SystemZ/fp-sincos-01.ll
index cd182a590eee0..4a38d7afba2c9 100644
--- a/test/CodeGen/SystemZ/fp-sincos-01.ll
+++ b/test/CodeGen/SystemZ/fp-sincos-01.ll
@@ -1,6 +1,6 @@
; Test that the combined sin/cos library call is emitted when appropriate
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-NOOPT
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK-OPT
; RUN: llc < %s -mtriple=s390x-linux-gnu -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-OPT
define float @f1(float %x) {
@@ -8,10 +8,18 @@ define float @f1(float %x) {
; CHECK-OPT: brasl %r14, sincosf@PLT
; CHECK-OPT: le %f0, 164(%r15)
; CHECK-OPT: aeb %f0, 160(%r15)
+ %tmp1 = call float @sinf(float %x) readnone
+ %tmp2 = call float @cosf(float %x) readnone
+ %add = fadd float %tmp1, %tmp2
+ ret float %add
+}
-; CHECK-NOOPT-LABEL: f1:
-; CHECK-NOOPT: brasl %r14, sinf@PLT
-; CHECK-NOOPT: brasl %r14, cosf@PLT
+define float @f1_errno(float %x) {
+; CHECK-OPT-LABEL: f1_errno:
+; CHECK-OPT: brasl %r14, sinf@PLT
+; CHECK-OPT: ler %f9, %f0
+; CHECK-OPT: brasl %r14, cosf@PLT
+; CHECK-OPT: aebr %f0, %f9
%tmp1 = call float @sinf(float %x)
%tmp2 = call float @cosf(float %x)
%add = fadd float %tmp1, %tmp2
@@ -23,10 +31,18 @@ define double @f2(double %x) {
; CHECK-OPT: brasl %r14, sincos@PLT
; CHECK-OPT: ld %f0, 168(%r15)
; CHECK-OPT: adb %f0, 160(%r15)
+ %tmp1 = call double @sin(double %x) readnone
+ %tmp2 = call double @cos(double %x) readnone
+ %add = fadd double %tmp1, %tmp2
+ ret double %add
+}
-; CHECK-NOOPT-LABEL: f2:
-; CHECK-NOOPT: brasl %r14, sin@PLT
-; CHECK-NOOPT: brasl %r14, cos@PLT
+define double @f2_errno(double %x) {
+; CHECK-OPT-LABEL: f2_errno:
+; CHECK-OPT: brasl %r14, sin@PLT
+; CHECK-OPT: ldr %f9, %f0
+; CHECK-OPT: brasl %r14, cos@PLT
+; CHECK-OPT: adbr %f0, %f9
%tmp1 = call double @sin(double %x)
%tmp2 = call double @cos(double %x)
%add = fadd double %tmp1, %tmp2
@@ -37,20 +53,27 @@ define fp128 @f3(fp128 %x) {
; CHECK-OPT-LABEL: f3:
; CHECK-OPT: brasl %r14, sincosl@PLT
; CHECK-OPT: axbr
+ %tmp1 = call fp128 @sinl(fp128 %x) readnone
+ %tmp2 = call fp128 @cosl(fp128 %x) readnone
+ %add = fadd fp128 %tmp1, %tmp2
+ ret fp128 %add
+}
-; CHECK-NOOPT-LABEL: f3:
-; CHECK-NOOPT: brasl %r14, sinl@PLT
-; CHECK-NOOPT: brasl %r14, cosl@PLT
+define fp128 @f3_errno(fp128 %x) {
+; CHECK-OPT-LABEL: f3_errno:
+; CHECK-OPT: brasl %r14, sinl@PLT
+; CHECK-OPT: brasl %r14, cosl@PLT
+; CHECK-OPT: axbr
%tmp1 = call fp128 @sinl(fp128 %x)
%tmp2 = call fp128 @cosl(fp128 %x)
%add = fadd fp128 %tmp1, %tmp2
ret fp128 %add
}
-declare float @sinf(float) readonly
-declare double @sin(double) readonly
-declare fp128 @sinl(fp128) readonly
-declare float @cosf(float) readonly
-declare double @cos(double) readonly
-declare fp128 @cosl(fp128) readonly
+declare float @sinf(float)
+declare double @sin(double)
+declare fp128 @sinl(fp128)
+declare float @cosf(float)
+declare double @cos(double)
+declare fp128 @cosl(fp128)
diff --git a/test/CodeGen/X86/2012-01-11-split-cv.ll b/test/CodeGen/X86/2012-01-11-split-cv.ll
index 212acedafb940..34ec48a02517d 100644
--- a/test/CodeGen/X86/2012-01-11-split-cv.ll
+++ b/test/CodeGen/X86/2012-01-11-split-cv.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mattr=+avx -mtriple=i686-unknown-unknown | FileCheck %s
define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
@@ -12,7 +12,6 @@ define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind
; CHECK-NEXT: vmovups %ymm0, (%eax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl $4
-;
%b = load <18 x i16>, <18 x i16>* %bp, align 16
%x = add <18 x i16> zeroinitializer, %b
store <18 x i16> %x, <18 x i16>* %ret, align 16
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index 93888c470e2db..47c74175f9497 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -582,12 +582,76 @@ if.end: ; preds = %if.then, %entry
ret i32 %x.addr.0
}
+;CHECK-LABEL: multi_segment:
+;YESCOLOR: subq $256, %rsp
+;NOFIRSTUSE: subq $256, %rsp
+;NOCOLOR: subq $512, %rsp
+define i1 @multi_segment(i1, i1)
+{
+entry-block:
+ %foo = alloca [32 x i64]
+ %bar = alloca [32 x i64]
+ %foo_i8 = bitcast [32 x i64]* %foo to i8*
+ %bar_i8 = bitcast [32 x i64]* %bar to i8*
+ call void @llvm.lifetime.start.p0i8(i64 256, i8* %bar_i8)
+ call void @baz([32 x i64]* %bar, i32 1)
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* %bar_i8)
+ call void @llvm.lifetime.start.p0i8(i64 256, i8* %foo_i8)
+ call void @baz([32 x i64]* %foo, i32 1)
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* %foo_i8)
+ call void @llvm.lifetime.start.p0i8(i64 256, i8* %bar_i8)
+ call void @baz([32 x i64]* %bar, i32 1)
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* %bar_i8)
+ ret i1 true
+}
+
+;CHECK-LABEL: pr32488:
+;YESCOLOR: subq $256, %rsp
+;NOFIRSTUSE: subq $256, %rsp
+;NOCOLOR: subq $512, %rsp
+define i1 @pr32488(i1, i1)
+{
+entry-block:
+ %foo = alloca [32 x i64]
+ %bar = alloca [32 x i64]
+ %foo_i8 = bitcast [32 x i64]* %foo to i8*
+ %bar_i8 = bitcast [32 x i64]* %bar to i8*
+ br i1 %0, label %if_false, label %if_true
+if_false:
+ call void @llvm.lifetime.start.p0i8(i64 256, i8* %bar_i8)
+ call void @baz([32 x i64]* %bar, i32 0)
+ br i1 %1, label %if_false.1, label %onerr
+if_false.1:
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* %bar_i8)
+ br label %merge
+if_true:
+ call void @llvm.lifetime.start.p0i8(i64 256, i8* %foo_i8)
+ call void @baz([32 x i64]* %foo, i32 1)
+ br i1 %1, label %if_true.1, label %onerr
+if_true.1:
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* %foo_i8)
+ br label %merge
+merge:
+ ret i1 false
+onerr:
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* %foo_i8)
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* %bar_i8)
+ call void @destructor()
+ ret i1 true
+}
+
+%Data = type { [32 x i64] }
+
+declare void @destructor()
+
declare void @inita(i32*)
declare void @initb(i32*,i32*,i32*)
declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+declare void @baz([32 x i64]*, i32)
+
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/add-sub-nsw-nuw.ll b/test/CodeGen/X86/add-sub-nsw-nuw.ll
index f5bffb2386bd8..d02736de55d3a 100644
--- a/test/CodeGen/X86/add-sub-nsw-nuw.ll
+++ b/test/CodeGen/X86/add-sub-nsw-nuw.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i386-apple-darwin < %s | FileCheck %s
; PR30841: https://llvm.org/bugs/show_bug.cgi?id=30841
@@ -12,7 +12,6 @@ define i8 @PR30841(i64 %argc) {
; CHECK-NEXT: negl %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retl
-;
entry:
%or = or i64 %argc, -4294967296
br label %end
diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll
index 3c84af4aa9ec6..cffcfd8e8a426 100644
--- a/test/CodeGen/X86/addcarry.ll
+++ b/test/CodeGen/X86/addcarry.ll
@@ -81,6 +81,30 @@ entry:
ret void
}
+define i8 @e(i32* nocapture %a, i32 %b) nounwind {
+; CHECK-LABEL: e:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: movl (%rdi), %ecx
+; CHECK-NEXT: leal (%rsi,%rcx), %edx
+; CHECK-NEXT: addl %esi, %edx
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: addl %esi, %ecx
+; CHECK-NEXT: movl %edx, (%rdi)
+; CHECK-NEXT: adcb $0, %al
+; CHECK-NEXT: retq
+ %1 = load i32, i32* %a, align 4
+ %2 = add i32 %1, %b
+ %3 = icmp ult i32 %2, %b
+ %4 = zext i1 %3 to i8
+ %5 = add i32 %2, %b
+ store i32 %5, i32* %a, align 4
+ %6 = icmp ult i32 %5, %b
+ %7 = zext i1 %6 to i8
+ %8 = add nuw nsw i8 %7, %4
+ ret i8 %8
+}
+
%scalar = type { [4 x i64] }
define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index f4a77c370db5e..9a21f4b5cabab 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -50,16 +50,10 @@ entry:
}
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; AVX1-LABEL: shuffle_v8f32_01230123_mem:
-; AVX1: ## BB#0: ## %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_01230123_mem:
-; AVX2: ## BB#0: ## %entry
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_01230123_mem:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; ALL-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb
@@ -195,17 +189,15 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounw
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vmovaps (%rsi), %ymm1
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX2-NEXT: retq
entry:
%c = load <16 x i16>, <16 x i16>* %a
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index cebcba38bd4fe..0640581156845 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s
; PR3253
@@ -24,7 +24,12 @@ define void @test2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB0_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -44,7 +49,13 @@ define void @test2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB1_1
-;
+; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -64,7 +75,12 @@ define void @atest2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB2_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -84,7 +100,13 @@ define void @atest2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB3_1
-;
+; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB3_1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -104,7 +126,13 @@ define void @test3(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB4_1
-;
+; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB4_1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -124,7 +152,13 @@ define void @test3b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB5_1
-;
+; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB5_1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -144,7 +178,12 @@ define void @testne2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB6_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -164,7 +203,12 @@ define void @testne2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB7_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -184,7 +228,12 @@ define void @atestne2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB8_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -204,7 +253,12 @@ define void @atestne2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB9_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -224,7 +278,12 @@ define void @testne3(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB10_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB10_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -244,7 +303,12 @@ define void @testne3b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB11_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB11_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -264,7 +328,12 @@ define void @query2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB12_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -284,7 +353,12 @@ define void @query2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB13_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -304,7 +378,12 @@ define void @aquery2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB14_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -324,7 +403,12 @@ define void @aquery2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB15_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -344,7 +428,12 @@ define void @query3(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB16_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB16_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -364,7 +453,12 @@ define void @query3b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB17_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB17_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -384,7 +478,12 @@ define void @query3x(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB18_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB18_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -404,7 +503,12 @@ define void @query3bx(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jae .LBB19_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB19_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -424,7 +528,12 @@ define void @queryne2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB20_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -444,7 +553,12 @@ define void @queryne2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB21_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -464,7 +578,12 @@ define void @aqueryne2(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB22_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -484,7 +603,12 @@ define void @aqueryne2b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB23_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -504,7 +628,12 @@ define void @queryne3(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB24_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB24_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -524,7 +653,12 @@ define void @queryne3b(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB25_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB25_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -544,7 +678,12 @@ define void @queryne3x(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB26_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB26_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -564,7 +703,12 @@ define void @queryne3bx(i32 %x, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: jb .LBB27_2
-;
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB27_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -588,7 +732,6 @@ define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
-;
%neg = xor i32 %flags, -1
%shl = shl i32 1, %flag
%and = and i32 %shl, %neg
@@ -598,8 +741,10 @@ define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
define zeroext i1 @extend(i32 %bit, i64 %bits) {
; CHECK-LABEL: extend:
-; CHECK: # BB#0:
-; CHECK-NEXT: btl %edi, %esi
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %edi, %esi
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
entry:
%and = and i32 %bit, 31
%sh_prom = zext i32 %and to i64
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
index 6e4762b2e7933..e38039501646f 100644
--- a/test/CodeGen/X86/cmov-into-branch.ll
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
; cmp with single-use load, should not form branch.
@@ -9,7 +9,6 @@ define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
; CHECK-NEXT: cmovbel %edx, %esi
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
-;
%load = load double, double* %b, align 8
%cmp = fcmp olt double %load, %a
%cond = select i1 %cmp, i32 %x, i32 %y
@@ -24,7 +23,6 @@ define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
; CHECK-NEXT: cmovbel %esi, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-;
%cmp = fcmp ogt double %a, %b
%cond = select i1 %cmp, i32 %x, i32 %y
ret i32 %cond
@@ -39,7 +37,6 @@ define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
; CHECK-NEXT: cmovael %ecx, %edx
; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
-;
%load = load i32, i32* %b, align 4
%cmp = icmp ult i32 %load, %a
%cond = select i1 %cmp, i32 %x, i32 %y
@@ -56,7 +53,6 @@ define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
; CHECK-NEXT: cmovael %edx, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retq
-;
%load = load i32, i32* %b, align 4
%cmp = icmp ult i32 %load, %a
%cmp1 = icmp ugt i32 %load, %a
@@ -73,7 +69,6 @@ define i32 @weighted_select1(i32 %a, i32 %b) {
; CHECK-NEXT: cmovnel %edi, %esi
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
-;
%cmp = icmp ne i32 %a, 0
%sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
ret i32 %sel
@@ -84,12 +79,12 @@ define i32 @weighted_select2(i32 %a, i32 %b) {
; CHECK-LABEL: weighted_select2:
; CHECK: # BB#0:
; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jne [[LABEL_BB5:.*]]
-; CHECK: movl %esi, %edi
-; CHECK-NEXT: [[LABEL_BB5]]
+; CHECK-NEXT: jne .LBB5_2
+; CHECK-NEXT: # BB#1: # %select.false
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: .LBB5_2: # %select.end
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-;
%cmp = icmp ne i32 %a, 0
%sel = select i1 %cmp, i32 %a, i32 %b, !prof !1
ret i32 %sel
@@ -103,14 +98,14 @@ define i32 @weighted_select3(i32 %a, i32 %b) {
; CHECK-LABEL: weighted_select3:
; CHECK: # BB#0:
; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: je [[LABEL_BB6:.*]]
-; CHECK: movl %edi, %eax
+; CHECK-NEXT: je .LBB6_1
+; CHECK-NEXT: # BB#2: # %select.end
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-; CHECK: [[LABEL_BB6]]
+; CHECK-NEXT: .LBB6_1: # %select.false
; CHECK-NEXT: movl %esi, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-;
%cmp = icmp ne i32 %a, 0
%sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
ret i32 %sel
@@ -124,7 +119,6 @@ define i32 @unweighted_select(i32 %a, i32 %b) {
; CHECK-NEXT: cmovnel %edi, %esi
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
-;
%cmp = icmp ne i32 %a, 0
%sel = select i1 %cmp, i32 %a, i32 %b, !prof !3
ret i32 %sel
diff --git a/test/CodeGen/X86/combine-64bit-vec-binop.ll b/test/CodeGen/X86/combine-64bit-vec-binop.ll
index 2842cb1d9b6e7..2935a2095bbfe 100644
--- a/test/CodeGen/X86/combine-64bit-vec-binop.ll
+++ b/test/CodeGen/X86/combine-64bit-vec-binop.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=SSE41
define double @test1_add(double %A, double %B) {
@@ -6,7 +6,6 @@ define double @test1_add(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x i32>
%2 = bitcast double %B to <2 x i32>
%add = add <2 x i32> %1, %2
@@ -19,7 +18,6 @@ define double @test2_add(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <4 x i16>
%2 = bitcast double %B to <4 x i16>
%add = add <4 x i16> %1, %2
@@ -32,7 +30,6 @@ define double @test3_add(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <8 x i8>
%2 = bitcast double %B to <8 x i8>
%add = add <8 x i8> %1, %2
@@ -45,7 +42,6 @@ define double @test1_sub(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x i32>
%2 = bitcast double %B to <2 x i32>
%sub = sub <2 x i32> %1, %2
@@ -58,7 +54,6 @@ define double @test2_sub(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: psubw %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <4 x i16>
%2 = bitcast double %B to <4 x i16>
%sub = sub <4 x i16> %1, %2
@@ -71,7 +66,6 @@ define double @test3_sub(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: psubb %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <8 x i8>
%2 = bitcast double %B to <8 x i8>
%sub = sub <8 x i8> %1, %2
@@ -84,7 +78,6 @@ define double @test1_mul(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x i32>
%2 = bitcast double %B to <2 x i32>
%mul = mul <2 x i32> %1, %2
@@ -97,7 +90,6 @@ define double @test2_mul(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <4 x i16>
%2 = bitcast double %B to <4 x i16>
%mul = mul <4 x i16> %1, %2
@@ -114,7 +106,6 @@ define double @test3_mul(double %A, double %B) {
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <8 x i8>
%2 = bitcast double %B to <8 x i8>
%mul = mul <8 x i8> %1, %2
@@ -127,7 +118,6 @@ define double @test1_and(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x i32>
%2 = bitcast double %B to <2 x i32>
%and = and <2 x i32> %1, %2
@@ -140,7 +130,6 @@ define double @test2_and(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <4 x i16>
%2 = bitcast double %B to <4 x i16>
%and = and <4 x i16> %1, %2
@@ -153,7 +142,6 @@ define double @test3_and(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <8 x i8>
%2 = bitcast double %B to <8 x i8>
%and = and <8 x i8> %1, %2
@@ -166,7 +154,6 @@ define double @test1_or(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x i32>
%2 = bitcast double %B to <2 x i32>
%or = or <2 x i32> %1, %2
@@ -179,7 +166,6 @@ define double @test2_or(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <4 x i16>
%2 = bitcast double %B to <4 x i16>
%or = or <4 x i16> %1, %2
@@ -192,7 +178,6 @@ define double @test3_or(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <8 x i8>
%2 = bitcast double %B to <8 x i8>
%or = or <8 x i8> %1, %2
@@ -205,7 +190,6 @@ define double @test1_xor(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x i32>
%2 = bitcast double %B to <2 x i32>
%xor = xor <2 x i32> %1, %2
@@ -218,7 +202,6 @@ define double @test2_xor(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <4 x i16>
%2 = bitcast double %B to <4 x i16>
%xor = xor <4 x i16> %1, %2
@@ -231,7 +214,6 @@ define double @test3_xor(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <8 x i8>
%2 = bitcast double %B to <8 x i8>
%xor = xor <8 x i8> %1, %2
@@ -244,7 +226,6 @@ define double @test_fadd(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x float>
%2 = bitcast double %B to <2 x float>
%add = fadd <2 x float> %1, %2
@@ -257,7 +238,6 @@ define double @test_fsub(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: subps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x float>
%2 = bitcast double %B to <2 x float>
%sub = fsub <2 x float> %1, %2
@@ -270,7 +250,6 @@ define double @test_fmul(double %A, double %B) {
; SSE41: # BB#0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: retq
-;
%1 = bitcast double %A to <2 x float>
%2 = bitcast double %B to <2 x float>
%mul = fmul <2 x float> %1, %2
diff --git a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
index 4dc5b1ba03398..9dd184c8ab316 100644
--- a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -2,47 +2,47 @@
define i8* @test_memcpy1(i8* %P, i8* %Q) {
; CHECK: test_memcpy
- call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 1, i32 1)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 1)
ret i8* %P
+ ; 3rd arg (%edx) -- length
; CHECK-DAG: movl $1, %edx
- ; CHECK-DAG: movl $1, %ecx
- ; CHECK: __llvm_memcpy_element_atomic_1
+ ; CHECK: __llvm_memcpy_element_unordered_atomic_1
}
define i8* @test_memcpy2(i8* %P, i8* %Q) {
; CHECK: test_memcpy2
- call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 2, i32 2)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 2, i32 2)
ret i8* %P
+ ; 3rd arg (%edx) -- length
; CHECK-DAG: movl $2, %edx
- ; CHECK-DAG: movl $2, %ecx
- ; CHECK: __llvm_memcpy_element_atomic_2
+ ; CHECK: __llvm_memcpy_element_unordered_atomic_2
}
define i8* @test_memcpy4(i8* %P, i8* %Q) {
; CHECK: test_memcpy4
- call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 4, i32 4)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 4, i32 4)
ret i8* %P
+ ; 3rd arg (%edx) -- length
; CHECK-DAG: movl $4, %edx
- ; CHECK-DAG: movl $4, %ecx
- ; CHECK: __llvm_memcpy_element_atomic_4
+ ; CHECK: __llvm_memcpy_element_unordered_atomic_4
}
define i8* @test_memcpy8(i8* %P, i8* %Q) {
; CHECK: test_memcpy8
- call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 8 %P, i8* align 8 %Q, i64 8, i32 8)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %P, i8* align 8 %Q, i32 8, i32 8)
ret i8* %P
+ ; 3rd arg (%edx) -- length
; CHECK-DAG: movl $8, %edx
- ; CHECK-DAG: movl $8, %ecx
- ; CHECK: __llvm_memcpy_element_atomic_8
+ ; CHECK: __llvm_memcpy_element_unordered_atomic_8
}
define i8* @test_memcpy16(i8* %P, i8* %Q) {
; CHECK: test_memcpy16
- call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %P, i8* align 16 %Q, i64 16, i32 16)
+ call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %P, i8* align 16 %Q, i32 16, i32 16)
ret i8* %P
+ ; 3rd arg (%edx) -- length
; CHECK-DAG: movl $16, %edx
- ; CHECK-DAG: movl $16, %ecx
- ; CHECK: __llvm_memcpy_element_atomic_16
+ ; CHECK: __llvm_memcpy_element_unordered_atomic_16
}
define void @test_memcpy_args(i8** %Storage) {
@@ -51,18 +51,15 @@ define void @test_memcpy_args(i8** %Storage) {
%Src.addr = getelementptr i8*, i8** %Storage, i64 1
%Src = load i8*, i8** %Src.addr
- ; First argument
+ ; 1st arg (%rdi)
; CHECK-DAG: movq (%rdi), [[REG1:%r.+]]
; CHECK-DAG: movq [[REG1]], %rdi
- ; Second argument
+ ; 2nd arg (%rsi)
; CHECK-DAG: movq 8(%rdi), %rsi
- ; Third argument
+ ; 3rd arg (%edx) -- length
; CHECK-DAG: movl $4, %edx
- ; Fourth argument
- ; CHECK-DAG: movl $4, %ecx
- ; CHECK: __llvm_memcpy_element_atomic_4
- call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 4, i32 4)
- ret void
+ ; CHECK: __llvm_memcpy_element_unordered_atomic_4
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4)
+  ret void
}
-declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
diff --git a/test/CodeGen/X86/fast-isel-select-sse.ll b/test/CodeGen/X86/fast-isel-select-sse.ll
index 499fe5ba54a29..1b6bb36b77c86 100644
--- a/test/CodeGen/X86/fast-isel-select-sse.ll
+++ b/test/CodeGen/X86/fast-isel-select-sse.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefix=AVX
@@ -29,7 +29,6 @@ define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp oeq float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -56,7 +55,6 @@ define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp oeq double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -84,7 +82,6 @@ define float @select_fcmp_ogt_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ogt float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -112,7 +109,6 @@ define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ogt double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -140,7 +136,6 @@ define float @select_fcmp_oge_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp oge float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -168,7 +163,6 @@ define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp oge double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -195,7 +189,6 @@ define float @select_fcmp_olt_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp olt float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -222,7 +215,6 @@ define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp olt double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -249,7 +241,6 @@ define float @select_fcmp_ole_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ole float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -276,7 +267,6 @@ define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ole double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -303,7 +293,6 @@ define float @select_fcmp_ord_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ord float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -330,7 +319,6 @@ define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ord double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -357,7 +345,6 @@ define float @select_fcmp_uno_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp uno float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -384,7 +371,6 @@ define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp uno double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -411,7 +397,6 @@ define float @select_fcmp_ugt_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ugt float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -438,7 +423,6 @@ define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ugt double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -465,7 +449,6 @@ define float @select_fcmp_uge_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp uge float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -492,7 +475,6 @@ define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp uge double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -520,7 +502,6 @@ define float @select_fcmp_ult_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ult float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -548,7 +529,6 @@ define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ult double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -576,7 +556,6 @@ define float @select_fcmp_ule_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ule float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -604,7 +583,6 @@ define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp ule double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
@@ -631,7 +609,6 @@ define float @select_fcmp_une_f32(float %a, float %b, float %c, float %d) {
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp une float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
@@ -658,7 +635,6 @@ define double @select_fcmp_une_f64(double %a, double %b, double %c, double %d) {
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
; AVX512-NEXT: retq
-;
%1 = fcmp une double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
diff --git a/test/CodeGen/X86/fp-logic-replace.ll b/test/CodeGen/X86/fp-logic-replace.ll
index 308b42e10caa6..e62b2f3db237d 100644
--- a/test/CodeGen/X86/fp-logic-replace.ll
+++ b/test/CodeGen/X86/fp-logic-replace.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512DQ
diff --git a/test/CodeGen/X86/fp-logic.ll b/test/CodeGen/X86/fp-logic.ll
index 973e0644b4e98..976470a83030c 100644
--- a/test/CodeGen/X86/fp-logic.ll
+++ b/test/CodeGen/X86/fp-logic.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s
; PR22428: https://llvm.org/bugs/show_bug.cgi?id=22428
@@ -22,7 +22,6 @@ define i32 @f1(float %x, i32 %y) {
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 %bc1, %y
ret i32 %and
@@ -36,7 +35,6 @@ define i32 @f2(float %x, i32 %y) {
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 %y, %bc1
ret i32 %and
@@ -50,7 +48,6 @@ define i32 @f3(float %x) {
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 %bc1, 1
ret i32 %and
@@ -64,7 +61,6 @@ define i32 @f4(float %x) {
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl $2, %eax
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 2, %bc1
ret i32 %and
@@ -78,7 +74,6 @@ define float @f5(float %x, i32 %y) {
; CHECK-NEXT: movd %edi, %xmm1
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 %bc1, %y
%bc2 = bitcast i32 %and to float
@@ -93,7 +88,6 @@ define float @f6(float %x, i32 %y) {
; CHECK-NEXT: movd %edi, %xmm1
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 %y, %bc1
%bc2 = bitcast i32 %and to float
@@ -108,7 +102,6 @@ define float @f7(float %x) {
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 %bc1, 3
%bc2 = bitcast i32 %and to float
@@ -123,7 +116,6 @@ define float @f8(float %x) {
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 4, %bc1
%bc2 = bitcast i32 %and to float
@@ -138,7 +130,6 @@ define i32 @f9(float %x, float %y) {
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%bc2 = bitcast float %y to i32
%and = and i32 %bc1, %bc2
@@ -152,7 +143,6 @@ define float @f10(float %x, float %y) {
; CHECK: # BB#0:
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%bc2 = bitcast float %y to i32
%and = and i32 %bc1, %bc2
@@ -165,7 +155,6 @@ define float @or(float %x, float %y) {
; CHECK: # BB#0:
; CHECK-NEXT: orps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%bc2 = bitcast float %y to i32
%and = or i32 %bc1, %bc2
@@ -178,7 +167,6 @@ define float @xor(float %x, float %y) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%bc2 = bitcast float %y to i32
%and = xor i32 %bc1, %bc2
@@ -192,7 +180,6 @@ define float @f7_or(float %x) {
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: orps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = or i32 %bc1, 3
%bc2 = bitcast i32 %and to float
@@ -205,7 +192,6 @@ define float @f7_xor(float %x) {
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: xorps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = xor i32 %bc1, 3
%bc2 = bitcast i32 %and to float
@@ -219,7 +205,6 @@ define double @doubles(double %x, double %y) {
; CHECK: # BB#0:
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast double %x to i64
%bc2 = bitcast double %y to i64
%and = and i64 %bc1, %bc2
@@ -233,7 +218,6 @@ define double @f7_double(double %x) {
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast double %x to i64
%and = and i64 %bc1, 3
%bc2 = bitcast i64 %and to double
@@ -250,7 +234,6 @@ define float @movmsk(float %x) {
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%and = and i32 %bc1, 2147483648
%bc2 = bitcast i32 %and to float
@@ -262,7 +245,6 @@ define double @bitcast_fabs(double %x) {
; CHECK: # BB#0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast double %x to i64
%and = and i64 %bc1, 9223372036854775807
%bc2 = bitcast i64 %and to double
@@ -274,7 +256,6 @@ define float @bitcast_fneg(float %x) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast float %x to i32
%xor = xor i32 %bc1, 2147483648
%bc2 = bitcast i32 %xor to float
@@ -286,7 +267,6 @@ define <2 x double> @bitcast_fabs_vec(<2 x double> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast <2 x double> %x to <2 x i64>
%and = and <2 x i64> %bc1, <i64 9223372036854775807, i64 9223372036854775807>
%bc2 = bitcast <2 x i64> %and to <2 x double>
@@ -298,7 +278,6 @@ define <4 x float> @bitcast_fneg_vec(<4 x float> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%bc1 = bitcast <4 x float> %x to <4 x i32>
%xor = xor <4 x i32> %bc1, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
%bc2 = bitcast <4 x i32> %xor to <4 x float>
diff --git a/test/CodeGen/X86/fp-select-cmp-and.ll b/test/CodeGen/X86/fp-select-cmp-and.ll
index e012809cf480b..651d7a3351c67 100644
--- a/test/CodeGen/X86/fp-select-cmp-and.ll
+++ b/test/CodeGen/X86/fp-select-cmp-and.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s
define double @test1(double %a, double %b, double %eps) {
@@ -7,7 +7,6 @@ define double @test1(double %a, double %b, double %eps) {
; CHECK-NEXT: cmpltsd %xmm2, %xmm0
; CHECK-NEXT: andpd %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp olt double %a, %eps
%cond = select i1 %cmp, double %b, double 0.000000e+00
ret double %cond
@@ -19,7 +18,6 @@ define double @test2(double %a, double %b, double %eps) {
; CHECK-NEXT: cmplesd %xmm2, %xmm0
; CHECK-NEXT: andpd %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ole double %a, %eps
%cond = select i1 %cmp, double %b, double 0.000000e+00
ret double %cond
@@ -32,7 +30,6 @@ define double @test3(double %a, double %b, double %eps) {
; CHECK-NEXT: andpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ogt double %a, %eps
%cond = select i1 %cmp, double %b, double 0.000000e+00
ret double %cond
@@ -45,7 +42,6 @@ define double @test4(double %a, double %b, double %eps) {
; CHECK-NEXT: andpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp oge double %a, %eps
%cond = select i1 %cmp, double %b, double 0.000000e+00
ret double %cond
@@ -57,7 +53,6 @@ define double @test5(double %a, double %b, double %eps) {
; CHECK-NEXT: cmpltsd %xmm2, %xmm0
; CHECK-NEXT: andnpd %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp olt double %a, %eps
%cond = select i1 %cmp, double 0.000000e+00, double %b
ret double %cond
@@ -69,7 +64,6 @@ define double @test6(double %a, double %b, double %eps) {
; CHECK-NEXT: cmplesd %xmm2, %xmm0
; CHECK-NEXT: andnpd %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ole double %a, %eps
%cond = select i1 %cmp, double 0.000000e+00, double %b
ret double %cond
@@ -82,7 +76,6 @@ define double @test7(double %a, double %b, double %eps) {
; CHECK-NEXT: andnpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ogt double %a, %eps
%cond = select i1 %cmp, double 0.000000e+00, double %b
ret double %cond
@@ -95,7 +88,6 @@ define double @test8(double %a, double %b, double %eps) {
; CHECK-NEXT: andnpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp oge double %a, %eps
%cond = select i1 %cmp, double 0.000000e+00, double %b
ret double %cond
@@ -107,7 +99,6 @@ define float @test9(float %a, float %b, float %eps) {
; CHECK-NEXT: cmpltss %xmm2, %xmm0
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp olt float %a, %eps
%cond = select i1 %cmp, float %b, float 0.000000e+00
ret float %cond
@@ -119,7 +110,6 @@ define float @test10(float %a, float %b, float %eps) {
; CHECK-NEXT: cmpless %xmm2, %xmm0
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ole float %a, %eps
%cond = select i1 %cmp, float %b, float 0.000000e+00
ret float %cond
@@ -132,7 +122,6 @@ define float @test11(float %a, float %b, float %eps) {
; CHECK-NEXT: andps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ogt float %a, %eps
%cond = select i1 %cmp, float %b, float 0.000000e+00
ret float %cond
@@ -145,7 +134,6 @@ define float @test12(float %a, float %b, float %eps) {
; CHECK-NEXT: andps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp oge float %a, %eps
%cond = select i1 %cmp, float %b, float 0.000000e+00
ret float %cond
@@ -157,7 +145,6 @@ define float @test13(float %a, float %b, float %eps) {
; CHECK-NEXT: cmpltss %xmm2, %xmm0
; CHECK-NEXT: andnps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp olt float %a, %eps
%cond = select i1 %cmp, float 0.000000e+00, float %b
ret float %cond
@@ -169,7 +156,6 @@ define float @test14(float %a, float %b, float %eps) {
; CHECK-NEXT: cmpless %xmm2, %xmm0
; CHECK-NEXT: andnps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ole float %a, %eps
%cond = select i1 %cmp, float 0.000000e+00, float %b
ret float %cond
@@ -182,7 +168,6 @@ define float @test15(float %a, float %b, float %eps) {
; CHECK-NEXT: andnps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp ogt float %a, %eps
%cond = select i1 %cmp, float 0.000000e+00, float %b
ret float %cond
@@ -195,7 +180,6 @@ define float @test16(float %a, float %b, float %eps) {
; CHECK-NEXT: andnps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp oge float %a, %eps
%cond = select i1 %cmp, float 0.000000e+00, float %b
ret float %cond
@@ -210,7 +194,6 @@ define float @test17(float %a, float %b, float %c, float %eps) {
; CHECK-NEXT: orps %xmm2, %xmm3
; CHECK-NEXT: movaps %xmm3, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp oge float %a, %eps
%cond = select i1 %cmp, float %c, float %b
ret float %cond
@@ -225,7 +208,6 @@ define double @test18(double %a, double %b, double %c, double %eps) {
; CHECK-NEXT: orpd %xmm2, %xmm3
; CHECK-NEXT: movapd %xmm3, %xmm0
; CHECK-NEXT: retq
-;
%cmp = fcmp oge double %a, %eps
%cond = select i1 %cmp, double %c, double %b
ret double %cond
diff --git a/test/CodeGen/X86/immediate_merging64.ll b/test/CodeGen/X86/immediate_merging64.ll
index ea8ace12a868c..4bc9d4af64403 100644
--- a/test/CodeGen/X86/immediate_merging64.ll
+++ b/test/CodeGen/X86/immediate_merging64.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
; Check that multiple instances of 64-bit constants encodable as
@@ -14,7 +14,6 @@ define i1 @imm_multiple_users(i64 %a, i64* %b) optsize {
; CHECK-NEXT: cmpq %rax, %rdi
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
-;
store i64 -1, i64* %b, align 8
%cmp = icmp eq i64 %a, -1
ret i1 %cmp
@@ -32,7 +31,6 @@ define void @memset_zero(i8* noalias nocapture %D) optsize {
; CHECK-NEXT: movq %rax, 7(%rdi)
; CHECK-NEXT: movq %rax, (%rdi)
; CHECK-NEXT: retq
-;
tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i32 1, i1 false)
ret void
}
diff --git a/test/CodeGen/X86/lea-opt-with-debug.mir b/test/CodeGen/X86/lea-opt-with-debug.mir
index 0a477706df15d..03a745888b5a0 100644
--- a/test/CodeGen/X86/lea-opt-with-debug.mir
+++ b/test/CodeGen/X86/lea-opt-with-debug.mir
@@ -49,7 +49,7 @@
!5 = !{i32 2, !"Dwarf Version", i32 4}
!6 = !{i32 2, !"Debug Info Version", i32 3}
!7 = !{i32 1, !"PIC Level", i32 2}
- !8 = !DIExpression(DW_OP_plus, 8, DW_OP_stack_value)
+ !8 = !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value)
!9 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 7, type: !10, isLocal: false, isDefinition: true, scopeLine: 7, isOptimized: true, unit: !0, variables: !11)
!10 = !DISubroutineType(types: !3)
!11 = !{!12}
diff --git a/test/CodeGen/X86/loop-search.ll b/test/CodeGen/X86/loop-search.ll
index 6b29a726fc1f3..fda4ecec0e6ab 100644
--- a/test/CodeGen/X86/loop-search.ll
+++ b/test/CodeGen/X86/loop-search.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; This test comes from PR27136
@@ -35,7 +35,6 @@ define zeroext i1 @search(i32 %needle, i32* nocapture readonly %haystack, i32 %c
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
-;
entry:
%cmp5 = icmp sgt i32 %count, 0
br i1 %cmp5, label %for.body.preheader, label %cleanup
diff --git a/test/CodeGen/X86/mask-negated-bool.ll b/test/CodeGen/X86/mask-negated-bool.ll
index c5c121c52966c..779641cee7d22 100644
--- a/test/CodeGen/X86/mask-negated-bool.ll
+++ b/test/CodeGen/X86/mask-negated-bool.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
define i32 @mask_negated_zext_bool1(i1 %x) {
@@ -7,7 +7,6 @@ define i32 @mask_negated_zext_bool1(i1 %x) {
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-;
%ext = zext i1 %x to i32
%neg = sub i32 0, %ext
%and = and i32 %neg, 1
@@ -19,7 +18,6 @@ define i32 @mask_negated_zext_bool2(i1 zeroext %x) {
; CHECK: # BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: retq
-;
%ext = zext i1 %x to i32
%neg = sub i32 0, %ext
%and = and i32 %neg, 1
@@ -31,7 +29,6 @@ define <4 x i32> @mask_negated_zext_bool_vec(<4 x i1> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%ext = zext <4 x i1> %x to <4 x i32>
%neg = sub <4 x i32> zeroinitializer, %ext
%and = and <4 x i32> %neg, <i32 1, i32 1, i32 1, i32 1>
@@ -44,7 +41,6 @@ define i32 @mask_negated_sext_bool1(i1 %x) {
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-;
%ext = sext i1 %x to i32
%neg = sub i32 0, %ext
%and = and i32 %neg, 1
@@ -56,7 +52,6 @@ define i32 @mask_negated_sext_bool2(i1 zeroext %x) {
; CHECK: # BB#0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: retq
-;
%ext = sext i1 %x to i32
%neg = sub i32 0, %ext
%and = and i32 %neg, 1
@@ -68,7 +63,6 @@ define <4 x i32> @mask_negated_sext_bool_vec(<4 x i1> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%ext = sext <4 x i1> %x to <4 x i32>
%neg = sub <4 x i32> zeroinitializer, %ext
%and = and <4 x i32> %neg, <i32 1, i32 1, i32 1, i32 1>
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index a02ef29ca6b37..1ac972048f12b 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
define fastcc void @t1() nounwind {
@@ -10,7 +10,6 @@ define fastcc void @t1() nounwind {
; CHECK-NEXT: pushl $0
; CHECK-NEXT: calll _memset
; CHECK-NEXT: addl $16, %esp
-;
entry:
call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 188, i32 1, i1 false)
unreachable
@@ -23,7 +22,6 @@ define fastcc void @t2(i8 signext %c) nounwind {
; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $76, {{[0-9]+}}(%esp)
; CHECK-NEXT: calll _memset
-;
entry:
call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i32 1, i1 false)
unreachable
@@ -40,7 +38,6 @@ define void @t3(i8* nocapture %s, i8 %a) nounwind {
; CHECK-NEXT: movl %ecx, 4(%eax)
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: retl
-;
entry:
tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 8, i32 1, i1 false)
ret void
@@ -58,7 +55,6 @@ define void @t4(i8* nocapture %s, i8 %a) nounwind {
; CHECK-NEXT: movw %cx, 12(%eax)
; CHECK-NEXT: movb %cl, 14(%eax)
; CHECK-NEXT: retl
-;
entry:
tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 15, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index 769fe87880b0f..13258fd81de5b 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
@@ -26,7 +26,6 @@ define void @memset_16_nonzero_bytes(i8* %x) {
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %xmm0, (%rdi)
; AVX-NEXT: retq
-;
%call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
ret void
}
@@ -54,7 +53,6 @@ define void @memset_32_nonzero_bytes(i8* %x) {
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
-;
%call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
ret void
}
@@ -89,7 +87,6 @@ define void @memset_64_nonzero_bytes(i8* %x) {
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
-;
%call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
ret void
}
@@ -138,7 +135,6 @@ define void @memset_128_nonzero_bytes(i8* %x) {
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
-;
%call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
ret void
}
@@ -189,7 +185,6 @@ define void @memset_256_nonzero_bytes(i8* %x) {
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
-;
%call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
ret void
}
@@ -231,7 +226,6 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: retq
-;
tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
ret void
}
@@ -275,7 +269,6 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
-;
tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i32 1, i1 false)
ret void
}
@@ -327,7 +320,6 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
-;
tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i32 1, i1 false)
ret void
}
@@ -395,7 +387,6 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
-;
tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i32 1, i1 false)
ret void
}
@@ -461,7 +452,6 @@ define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
-;
tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i32 1, i1 false)
ret void
}
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll
index 861cb88b0f573..a7a3c61b13925 100644
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_64
@@ -51,7 +51,6 @@ define void @bork() nounwind {
; SLOW_64-NEXT: movq $0, 8
; SLOW_64-NEXT: movq $0, 0
; SLOW_64-NEXT: retq
-;
call void @llvm.memset.p0i8.i64(i8* null, i8 0, i64 80, i32 4, i1 false)
ret void
}
diff --git a/test/CodeGen/X86/negate-i1.ll b/test/CodeGen/X86/negate-i1.ll
index f1678a1b22ff9..13f831fd37b7b 100644
--- a/test/CodeGen/X86/negate-i1.ll
+++ b/test/CodeGen/X86/negate-i1.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s --check-prefix=X32
@@ -16,7 +16,6 @@ define i8 @select_i8_neg1_or_0(i1 %a) {
; X32-NEXT: andb $1, %al
; X32-NEXT: negb %al
; X32-NEXT: retl
-;
%b = sext i1 %a to i8
ret i8 %b
}
@@ -33,7 +32,6 @@ define i8 @select_i8_neg1_or_0_zeroext(i1 zeroext %a) {
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: negb %al
; X32-NEXT: retl
-;
%b = sext i1 %a to i8
ret i8 %b
}
@@ -53,7 +51,6 @@ define i16 @select_i16_neg1_or_0(i1 %a) {
; X32-NEXT: negl %eax
; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT: retl
-;
%b = sext i1 %a to i16
ret i16 %b
}
@@ -72,7 +69,6 @@ define i16 @select_i16_neg1_or_0_zeroext(i1 zeroext %a) {
; X32-NEXT: negl %eax
; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT: retl
-;
%b = sext i1 %a to i16
ret i16 %b
}
@@ -91,7 +87,6 @@ define i32 @select_i32_neg1_or_0(i1 %a) {
; X32-NEXT: andl $1, %eax
; X32-NEXT: negl %eax
; X32-NEXT: retl
-;
%b = sext i1 %a to i32
ret i32 %b
}
@@ -108,7 +103,6 @@ define i32 @select_i32_neg1_or_0_zeroext(i1 zeroext %a) {
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: negl %eax
; X32-NEXT: retl
-;
%b = sext i1 %a to i32
ret i32 %b
}
@@ -129,7 +123,6 @@ define i64 @select_i64_neg1_or_0(i1 %a) {
; X32-NEXT: negl %eax
; X32-NEXT: movl %eax, %edx
; X32-NEXT: retl
-;
%b = sext i1 %a to i64
ret i64 %b
}
@@ -147,7 +140,6 @@ define i64 @select_i64_neg1_or_0_zeroext(i1 zeroext %a) {
; X32-NEXT: negl %eax
; X32-NEXT: movl %eax, %edx
; X32-NEXT: retl
-;
%b = sext i1 %a to i64
ret i64 %b
}
diff --git a/test/CodeGen/X86/negate-shift.ll b/test/CodeGen/X86/negate-shift.ll
index 54ffc8e71e07c..cbe2f9456fa1c 100644
--- a/test/CodeGen/X86/negate-shift.ll
+++ b/test/CodeGen/X86/negate-shift.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
define i32 @neg_lshr_signbit(i32 %x) {
@@ -7,7 +7,6 @@ define i32 @neg_lshr_signbit(i32 %x) {
; X64-NEXT: sarl $31, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
-;
%sh = lshr i32 %x, 31
%neg = sub i32 0, %sh
ret i32 %neg
@@ -19,7 +18,6 @@ define i64 @neg_ashr_signbit(i64 %x) {
; X64-NEXT: shrq $63, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
-;
%sh = ashr i64 %x, 63
%neg = sub i64 0, %sh
ret i64 %neg
@@ -30,7 +28,6 @@ define <4 x i32> @neg_ashr_signbit_vec(<4 x i32> %x) {
; X64: # BB#0:
; X64-NEXT: psrld $31, %xmm0
; X64-NEXT: retq
-;
%sh = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
%neg = sub <4 x i32> zeroinitializer, %sh
ret <4 x i32> %neg
@@ -41,7 +38,6 @@ define <8 x i16> @neg_lshr_signbit_vec(<8 x i16> %x) {
; X64: # BB#0:
; X64-NEXT: psraw $15, %xmm0
; X64-NEXT: retq
-;
%sh = lshr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%neg = sub <8 x i16> zeroinitializer, %sh
ret <8 x i16> %neg
diff --git a/test/CodeGen/X86/negate.ll b/test/CodeGen/X86/negate.ll
index 6f07378e0e46b..5bdb11479afc6 100644
--- a/test/CodeGen/X86/negate.ll
+++ b/test/CodeGen/X86/negate.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
define i32 @negate_nuw(i32 %x) {
@@ -6,7 +6,6 @@ define i32 @negate_nuw(i32 %x) {
; CHECK: # BB#0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
-;
%neg = sub nuw i32 0, %x
ret i32 %neg
}
@@ -16,7 +15,6 @@ define <4 x i32> @negate_nuw_vec(<4 x i32> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%neg = sub nuw <4 x i32> zeroinitializer, %x
ret <4 x i32> %neg
}
@@ -26,7 +24,6 @@ define i8 @negate_zero_or_minsigned_nsw(i8 %x) {
; CHECK: # BB#0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
-;
%signbit = and i8 %x, 128
%neg = sub nsw i8 0, %signbit
ret i8 %neg
@@ -37,7 +34,6 @@ define <4 x i32> @negate_zero_or_minsigned_nsw_vec(<4 x i32> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%signbit = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
%neg = sub nsw <4 x i32> zeroinitializer, %signbit
ret <4 x i32> %neg
@@ -49,7 +45,6 @@ define i8 @negate_zero_or_minsigned(i8 %x) {
; CHECK-NEXT: shlb $7, %dil
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-;
%signbit = shl i8 %x, 7
%neg = sub i8 0, %signbit
ret i8 %neg
@@ -60,7 +55,6 @@ define <4 x i32> @negate_zero_or_minsigned_vec(<4 x i32> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%signbit = and <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
%neg = sub <4 x i32> zeroinitializer, %signbit
ret <4 x i32> %neg
diff --git a/test/CodeGen/X86/negative-sin.ll b/test/CodeGen/X86/negative-sin.ll
index bc38021b5620c..94369e3e8d0fb 100644
--- a/test/CodeGen/X86/negative-sin.ll
+++ b/test/CodeGen/X86/negative-sin.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
declare double @sin(double %f)
@@ -16,7 +16,6 @@ define double @strict(double %e) nounwind {
; CHECK-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
-;
%f = fsub double 0.0, %e
%g = call double @sin(double %f) readonly
%h = fsub double 0.0, %g
@@ -29,8 +28,7 @@ define double @strict(double %e) nounwind {
define double @fast(double %e) nounwind {
; CHECK-LABEL: fast:
; CHECK: # BB#0:
-; CHECK-NEXT: jmp sin
-;
+; CHECK-NEXT: jmp sin # TAILCALL
%f = fsub fast double 0.0, %e
%g = call double @sin(double %f) readonly
%h = fsub fast double 0.0, %g
@@ -42,8 +40,7 @@ define double @fast(double %e) nounwind {
define double @nsz(double %e) nounwind {
; CHECK-LABEL: nsz:
; CHECK: # BB#0:
-; CHECK-NEXT: jmp sin
-;
+; CHECK-NEXT: jmp sin # TAILCALL
%f = fsub nsz double 0.0, %e
%g = call double @sin(double %f) readonly
%h = fsub nsz double 0.0, %g
@@ -62,7 +59,6 @@ define double @semi_strict1(double %e) nounwind {
; CHECK-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
-;
%f = fsub double 0.0, %e
%g = call double @sin(double %f) readonly
%h = fsub nsz double 0.0, %g
@@ -80,7 +76,6 @@ define double @semi_strict2(double %e) nounwind {
; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
-;
%f = fsub nsz double 0.0, %e
%g = call double @sin(double %f) readonly
%h = fsub double 0.0, %g
@@ -93,8 +88,7 @@ define double @semi_strict2(double %e) nounwind {
define double @fn_attr(double %e) nounwind #0 {
; CHECK-LABEL: fn_attr:
; CHECK: # BB#0:
-; CHECK-NEXT: jmp sin
-;
+; CHECK-NEXT: jmp sin # TAILCALL
%f = fsub double 0.0, %e
%g = call double @sin(double %f) readonly
%h = fsub double 0.0, %g
diff --git a/test/CodeGen/X86/no-sse2-avg.ll b/test/CodeGen/X86/no-sse2-avg.ll
index 0ed0a7f74cb3d..e4b97c17047cd 100644
--- a/test/CodeGen/X86/no-sse2-avg.ll
+++ b/test/CodeGen/X86/no-sse2-avg.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; REQUIRES: asserts
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s
@@ -23,7 +23,6 @@ define <16 x i8> @PR27973() {
; CHECK-NEXT: movb $0, (%rdi)
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
-;
%t0 = zext <16 x i8> zeroinitializer to <16 x i32>
%t1 = add nuw nsw <16 x i32> %t0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%t2 = lshr <16 x i32> %t1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll
index 83b2be83d5524..87aa10a6e2960 100644
--- a/test/CodeGen/X86/not-and-simplify.ll
+++ b/test/CodeGen/X86/not-and-simplify.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefix=ALL --check-prefix=NO_BMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=ALL --check-prefix=BMI
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
index 1b1622513ea65..665df2c183bf2 100644
--- a/test/CodeGen/X86/pr13577.ll
+++ b/test/CodeGen/X86/pr13577.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-darwin | FileCheck %s
; CHECK-LABEL: LCPI0_0:
@@ -12,12 +12,11 @@ define x86_fp80 @foo(x86_fp80 %a) {
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fstpt -{{[0-9]+}}(%rsp)
; CHECK-NEXT: testb $-128, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: flds LCPI0_0(%rip)
-; CHECK-NEXT: flds LCPI0_1(%rip)
+; CHECK-NEXT: flds {{.*}}(%rip)
+; CHECK-NEXT: flds {{.*}}(%rip)
; CHECK-NEXT: fcmovne %st(1), %st(0)
; CHECK-NEXT: fstp %st(1)
; CHECK-NEXT: retq
-;
%1 = tail call x86_fp80 @copysignl(x86_fp80 0xK7FFF8000000000000000, x86_fp80 %a) nounwind readnone
ret x86_fp80 %1
}
@@ -34,7 +33,6 @@ define float @pr26070() {
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: orps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%c = call float @copysignf(float 1.0, float undef) readnone
ret float %c
}
diff --git a/test/CodeGen/X86/pr18014.ll b/test/CodeGen/X86/pr18014.ll
index bb3b9c23f1e3f..cba065002d578 100644
--- a/test/CodeGen/X86/pr18014.ll
+++ b/test/CodeGen/X86/pr18014.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s
; Ensure PSRAD is generated as the condition is consumed by both PADD and
@@ -14,7 +14,6 @@ define <4 x i32> @foo(<4 x i32>* %p, <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v
; CHECK-NEXT: movaps %xmm2, (%rdi)
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%sext_cond = sext <4 x i1> %cond to <4 x i32>
%t1 = add <4 x i32> %v1, %sext_cond
%t2 = select <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2
diff --git a/test/CodeGen/X86/pr32368.ll b/test/CodeGen/X86/pr32368.ll
new file mode 100644
index 0000000000000..b0f0b123cca10
--- /dev/null
+++ b/test/CodeGen/X86/pr32368.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+
+define <4 x float> @PR32368_128(<4 x float>) {
+; SSE-LABEL: PR32368_128:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: addps %xmm0, %xmm0
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR32368_128:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR32368_128:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR32368_128:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %2 = bitcast <4 x float> %0 to <4 x i32>
+ %3 = and <4 x i32> %2, <i32 -292, i32 -292, i32 -292, i32 -292>
+ %4 = bitcast <4 x i32> %3 to <4 x float>
+ %5 = fmul <4 x float> %4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %6 = bitcast <4 x float> %5 to <4 x i32>
+ %7 = and <4 x i32> %6, <i32 291, i32 291, i32 291, i32 291>
+ %8 = bitcast <4 x i32> %7 to <4 x float>
+ ret <4 x float> %8
+}
+
+define <8 x float> @PR32368_256(<8 x float>) {
+; SSE-LABEL: PR32368_256:
+; SSE: # BB#0:
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [4294967004,4294967004,4294967004,4294967004]
+; SSE-NEXT: andps %xmm2, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm1
+; SSE-NEXT: addps %xmm1, %xmm1
+; SSE-NEXT: addps %xmm0, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [291,291,291,291]
+; SSE-NEXT: andps %xmm2, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR32368_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR32368_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR32368_256:
+; AVX512: # BB#0:
+; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %2 = bitcast <8 x float> %0 to <8 x i32>
+ %3 = and <8 x i32> %2, <i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292>
+ %4 = bitcast <8 x i32> %3 to <8 x float>
+ %5 = fmul <8 x float> %4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %6 = bitcast <8 x float> %5 to <8 x i32>
+ %7 = and <8 x i32> %6, <i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291>
+ %8 = bitcast <8 x i32> %7 to <8 x float>
+ ret <8 x float> %8
+}
+
+define <16 x float> @PR32368_512(<16 x float>) {
+; SSE-LABEL: PR32368_512:
+; SSE: # BB#0:
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [4294967004,4294967004,4294967004,4294967004]
+; SSE-NEXT: andps %xmm4, %xmm0
+; SSE-NEXT: andps %xmm4, %xmm1
+; SSE-NEXT: andps %xmm4, %xmm2
+; SSE-NEXT: andps %xmm4, %xmm3
+; SSE-NEXT: addps %xmm3, %xmm3
+; SSE-NEXT: addps %xmm2, %xmm2
+; SSE-NEXT: addps %xmm1, %xmm1
+; SSE-NEXT: addps %xmm0, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [291,291,291,291]
+; SSE-NEXT: andps %xmm4, %xmm0
+; SSE-NEXT: andps %xmm4, %xmm1
+; SSE-NEXT: andps %xmm4, %xmm2
+; SSE-NEXT: andps %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR32368_512:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR32368_512:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR32368_512:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT: vaddps %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %2 = bitcast <16 x float> %0 to <16 x i32>
+ %3 = and <16 x i32> %2, <i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292, i32 -292>
+ %4 = bitcast <16 x i32> %3 to <16 x float>
+ %5 = fmul <16 x float> %4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %6 = bitcast <16 x float> %5 to <16 x i32>
+ %7 = and <16 x i32> %6, <i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291, i32 291>
+ %8 = bitcast <16 x i32> %7 to <16 x float>
+ ret <16 x float> %8
+}
diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll
index cc591e5ac00bf..7b138f02eb4a8 100644
--- a/test/CodeGen/X86/rem.ll
+++ b/test/CodeGen/X86/rem.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
define i32 @test1(i32 %X) {
@@ -19,7 +19,6 @@ define i32 @test1(i32 %X) {
; CHECK-NEXT: subl %eax, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
-;
%tmp1 = srem i32 %X, 255
ret i32 %tmp1
}
@@ -35,7 +34,6 @@ define i32 @test2(i32 %X) {
; CHECK-NEXT: andl $-256, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: retl
-;
%tmp1 = srem i32 %X, 256
ret i32 %tmp1
}
@@ -54,7 +52,6 @@ define i32 @test3(i32 %X) {
; CHECK-NEXT: subl %eax, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
-;
%tmp1 = urem i32 %X, 255
ret i32 %tmp1
}
@@ -64,7 +61,6 @@ define i32 @test4(i32 %X) {
; CHECK: # BB#0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: retl
-;
%tmp1 = urem i32 %X, 256
ret i32 %tmp1
}
@@ -77,8 +73,8 @@ define i32 @test5(i32 %X) nounwind readnone {
; CHECK-NEXT: idivl {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: retl
-;
entry:
%0 = srem i32 41, %X
ret i32 %0
}
+
diff --git a/test/CodeGen/X86/sar_fold64.ll b/test/CodeGen/X86/sar_fold64.ll
index 213ca95fc78d5..66ad8c3f40fa7 100644
--- a/test/CodeGen/X86/sar_fold64.ll
+++ b/test/CodeGen/X86/sar_fold64.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
define i32 @shl48sar47(i64 %a) #0 {
@@ -8,7 +8,6 @@ define i32 @shl48sar47(i64 %a) #0 {
; CHECK-NEXT: addl %eax, %eax
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; CHECK-NEXT: retq
-;
%1 = shl i64 %a, 48
%2 = ashr exact i64 %1, 47
%3 = trunc i64 %2 to i32
@@ -22,7 +21,6 @@ define i32 @shl48sar49(i64 %a) #0 {
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; CHECK-NEXT: retq
-;
%1 = shl i64 %a, 48
%2 = ashr exact i64 %1, 49
%3 = trunc i64 %2 to i32
@@ -36,7 +34,6 @@ define i32 @shl56sar55(i64 %a) #0 {
; CHECK-NEXT: addl %eax, %eax
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; CHECK-NEXT: retq
-;
%1 = shl i64 %a, 56
%2 = ashr exact i64 %1, 55
%3 = trunc i64 %2 to i32
@@ -50,7 +47,6 @@ define i32 @shl56sar57(i64 %a) #0 {
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; CHECK-NEXT: retq
-;
%1 = shl i64 %a, 56
%2 = ashr exact i64 %1, 57
%3 = trunc i64 %2 to i32
@@ -64,7 +60,6 @@ define i8 @all_sign_bit_ashr(i8 %x) {
; CHECK-NEXT: negb %dil
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-;
%and = and i8 %x, 1
%neg = sub i8 0, %and
%sar = ashr i8 %neg, 6
@@ -79,7 +74,6 @@ define <4 x i32> @all_sign_bit_ashr_vec(<4 x i32> %x) {
; CHECK-NEXT: psubd %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%and = and <4 x i32> %x, <i32 1, i32 1, i32 1 , i32 1>
%neg = sub <4 x i32> zeroinitializer, %and
%sar = ashr <4 x i32> %neg, <i32 1, i32 31, i32 5, i32 0>
diff --git a/test/CodeGen/X86/select-with-and-or.ll b/test/CodeGen/X86/select-with-and-or.ll
index f49da8576d188..45e4384d0fa14 100644
--- a/test/CodeGen/X86/select-with-and-or.ll
+++ b/test/CodeGen/X86/select-with-and-or.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
@@ -7,7 +7,6 @@ define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> zeroinitializer
ret <4 x i32> %r
@@ -19,7 +18,6 @@ define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vorps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c
ret <4 x i32> %r
@@ -31,7 +29,6 @@ define <4 x i32> @test3(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> %c
ret <4 x i32> %r
@@ -43,7 +40,6 @@ define <4 x i32> @test4(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vorps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ult <4 x float> %a, %b
%r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %r
@@ -54,7 +50,6 @@ define <4 x i32> @test5(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ult <4 x float> %a, %b
%r = sext <4 x i1> %f to <4 x i32>
ret <4 x i32> %r
@@ -65,7 +60,6 @@ define <4 x i32> @test6(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%not.f = fcmp oge <4 x float> %a, %b
%r = sext <4 x i1> %not.f to <4 x i32>
ret <4 x i32> %r
@@ -77,7 +71,6 @@ define <4 x i32> @test7(<4 x float> %a, <4 x float> %b, <4 x i32>* %p) {
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ult <4 x float> %a, %b
%l = load <4 x i32>, <4 x i32>* %p, align 16
%r = select <4 x i1> %f, <4 x i32> %l, <4 x i32> zeroinitializer
@@ -92,7 +85,6 @@ define <2 x double> @test1f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ogt <2 x double> %a, %b
%r = select <2 x i1> %f, <2 x double> %c, <2 x double> zeroinitializer
ret <2 x double> %r
@@ -104,7 +96,6 @@ define <2 x double> @test2f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vorpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp oge <2 x double> %a, %b
%r = select <2 x i1> %f, <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>, <2 x double> %c
ret <2 x double> %r
@@ -116,7 +107,6 @@ define <2 x double> @test3f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-NEXT: vcmpnltpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vandpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp olt <2 x double> %a, %b
%r = select <2 x i1> %f, <2 x double> zeroinitializer, <2 x double> %c
ret <2 x double> %r
@@ -128,7 +118,6 @@ define <2 x double> @test4f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-NEXT: vcmpnlepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vorpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ole <2 x double> %a, %b
%r = select <2 x i1> %f, <2 x double> %c, <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
ret <2 x double> %r
@@ -139,7 +128,6 @@ define <2 x double> @test5f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpnlepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ugt <2 x double> %a, %b
%r = select <2 x i1> %f, <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>, <2 x double> zeroinitializer
ret <2 x double> %r
@@ -150,7 +138,6 @@ define <2 x double> @test6f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK: # BB#0:
; CHECK-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp ule <2 x double> %a, %b
%r = select <2 x i1> %f, <2 x double> zeroinitializer, <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
ret <2 x double> %r
@@ -162,7 +149,6 @@ define <2 x double> @test7f(<2 x double> %a, <2 x double> %b, <2 x double>* %p)
; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vandpd (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%f = fcmp oeq <2 x double> %a, %b
%l = load <2 x double>, <2 x double>* %p, align 16
%r = select <2 x i1> %f, <2 x double> %l, <2 x double> zeroinitializer
diff --git a/test/CodeGen/X86/sext-setcc-self.ll b/test/CodeGen/X86/sext-setcc-self.ll
index e739d21e64e0e..9cbd3d85b3817 100644
--- a/test/CodeGen/X86/sext-setcc-self.ll
+++ b/test/CodeGen/X86/sext-setcc-self.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
define <4 x i32> @test_ueq(<4 x float> %in) {
@@ -6,7 +6,6 @@ define <4 x i32> @test_ueq(<4 x float> %in) {
; CHECK: # BB#0:
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%t0 = fcmp ueq <4 x float> %in, %in
%t1 = sext <4 x i1> %t0 to <4 x i32>
ret <4 x i32> %t1
@@ -17,7 +16,6 @@ define <4 x i32> @test_uge(<4 x float> %in) {
; CHECK: # BB#0:
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%t0 = fcmp uge <4 x float> %in, %in
%t1 = sext <4 x i1> %t0 to <4 x i32>
ret <4 x i32> %t1
@@ -28,7 +26,6 @@ define <4 x i32> @test_ule(<4 x float> %in) {
; CHECK: # BB#0:
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%t0 = fcmp ule <4 x float> %in, %in
%t1 = sext <4 x i1> %t0 to <4 x i32>
ret <4 x i32> %t1
@@ -39,7 +36,6 @@ define <4 x i32> @test_one(<4 x float> %in) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%t0 = fcmp one <4 x float> %in, %in
%t1 = sext <4 x i1> %t0 to <4 x i32>
ret <4 x i32> %t1
@@ -50,7 +46,6 @@ define <4 x i32> @test_ogt(<4 x float> %in) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%t0 = fcmp ogt <4 x float> %in, %in
%t1 = sext <4 x i1> %t0 to <4 x i32>
ret <4 x i32> %t1
@@ -61,7 +56,6 @@ define <4 x i32> @test_olt(<4 x float> %in) {
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
-;
%t0 = fcmp olt <4 x float> %in, %in
%t1 = sext <4 x i1> %t0 to <4 x i32>
ret <4 x i32> %t1
diff --git a/test/CodeGen/X86/shift-pcmp.ll b/test/CodeGen/X86/shift-pcmp.ll
index adfd2f143d17d..f509da2674bcb 100644
--- a/test/CodeGen/X86/shift-pcmp.ll
+++ b/test/CodeGen/X86/shift-pcmp.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -o - -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -o - -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
@@ -14,7 +14,6 @@ define <8 x i16> @foo(<8 x i16> %a, <8 x i16> %b) {
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
-;
%icmp = icmp eq <8 x i16> %a, %b
%zext = zext <8 x i1> %icmp to <8 x i16>
%shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
@@ -34,7 +33,6 @@ define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
-;
%icmp = icmp eq <8 x i16> %a, %b
%zext = zext <8 x i1> %icmp to <8 x i16>
%shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 undef, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index f0dff3b806c53..e2fd63eab30fc 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -1,10 +1,12 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_SINCOS
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_NOOPT
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_NOOPT
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNUX32_SINCOS
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
-; Combine sin / cos into a single call.
+; Combine sin / cos into a single call unless they may write errno (as
+; captured by the readnone attribute, controlled by clang's -fmath-errno
+; setting).
; rdar://13087969
; rdar://13599493
@@ -15,25 +17,44 @@ entry:
; GNU_SINCOS: movss 4(%rsp), %xmm0
; GNU_SINCOS: addss (%rsp), %xmm0
-; GNUX32_SINCOS-LABEL: test1:
-; GNUX32_SINCOS: callq sincosf
-; GNUX32_SINCOS: movss 4(%esp), %xmm0
-; GNUX32_SINCOS: addss (%esp), %xmm0
-
-; GNU_NOOPT: test1
-; GNU_NOOPT: callq sinf
-; GNU_NOOPT: callq cosf
+; GNU_SINCOS_FASTMATH-LABEL: test1:
+; GNU_SINCOS_FASTMATH: callq sincosf
+; GNU_SINCOS_FASTMATH: movss 4(%{{[re]}}sp), %xmm0
+; GNU_SINCOS_FASTMATH: addss (%{{[re]}}sp), %xmm0
; OSX_SINCOS-LABEL: test1:
; OSX_SINCOS: callq ___sincosf_stret
; OSX_SINCOS: movshdup {{.*}} xmm1 = xmm0[1,1,3,3]
; OSX_SINCOS: addss %xmm1, %xmm0
-; OSX_NOOPT: test1
+; OSX_NOOPT-LABEL: test1:
; OSX_NOOPT: callq _sinf
; OSX_NOOPT: callq _cosf
- %call = tail call float @sinf(float %x) nounwind readnone
- %call1 = tail call float @cosf(float %x) nounwind readnone
+ %call = tail call float @sinf(float %x) readnone
+ %call1 = tail call float @cosf(float %x) readnone
+ %add = fadd float %call, %call1
+ ret float %add
+}
+
+define float @test1_errno(float %x) nounwind {
+entry:
+; GNU_SINCOS-LABEL: test1_errno:
+; GNU_SINCOS: callq sinf
+; GNU_SINCOS: callq cosf
+
+; GNU_SINCOS_FASTMATH-LABEL: test1_errno:
+; GNU_SINCOS_FASTMATH: callq sinf
+; GNU_SINCOS_FASTMATH: callq cosf
+
+; OSX_SINCOS-LABEL: test1_errno:
+; OSX_SINCOS: callq _sinf
+; OSX_SINCOS: callq _cosf
+
+; OSX_NOOPT-LABEL: test1_errno:
+; OSX_NOOPT: callq _sinf
+; OSX_NOOPT: callq _cosf
+ %call = tail call float @sinf(float %x)
+ %call1 = tail call float @cosf(float %x)
%add = fadd float %call, %call1
ret float %add
}
@@ -45,24 +66,43 @@ entry:
; GNU_SINCOS: movsd 16(%rsp), %xmm0
; GNU_SINCOS: addsd 8(%rsp), %xmm0
-; GNUX32_SINCOS-LABEL: test2:
-; GNUX32_SINCOS: callq sincos
-; GNUX32_SINCOS: movsd 16(%esp), %xmm0
-; GNUX32_SINCOS: addsd 8(%esp), %xmm0
-
-; GNU_NOOPT: test2:
-; GNU_NOOPT: callq sin
-; GNU_NOOPT: callq cos
+; GNU_SINCOS_FASTMATH-LABEL: test2:
+; GNU_SINCOS_FASTMATH: callq sincos
+; GNU_SINCOS_FASTMATH: movsd 16(%{{[re]}}sp), %xmm0
+; GNU_SINCOS_FASTMATH: addsd 8(%{{[re]}}sp), %xmm0
; OSX_SINCOS-LABEL: test2:
; OSX_SINCOS: callq ___sincos_stret
; OSX_SINCOS: addsd %xmm1, %xmm0
-; OSX_NOOPT: test2
+; OSX_NOOPT-LABEL: test2:
+; OSX_NOOPT: callq _sin
+; OSX_NOOPT: callq _cos
+ %call = tail call double @sin(double %x) readnone
+ %call1 = tail call double @cos(double %x) readnone
+ %add = fadd double %call, %call1
+ ret double %add
+}
+
+define double @test2_errno(double %x) nounwind {
+entry:
+; GNU_SINCOS-LABEL: test2_errno:
+; GNU_SINCOS: callq sin
+; GNU_SINCOS: callq cos
+
+; GNU_SINCOS_FASTMATH-LABEL: test2_errno:
+; GNU_SINCOS_FASTMATH: callq sin
+; GNU_SINCOS_FASTMATH: callq cos
+
+; OSX_SINCOS-LABEL: test2_errno:
+; OSX_SINCOS: callq _sin
+; OSX_SINCOS: callq _cos
+
+; OSX_NOOPT-LABEL: test2_errno:
; OSX_NOOPT: callq _sin
; OSX_NOOPT: callq _cos
- %call = tail call double @sin(double %x) nounwind readnone
- %call1 = tail call double @cos(double %x) nounwind readnone
+ %call = tail call double @sin(double %x)
+ %call1 = tail call double @cos(double %x)
%add = fadd double %call, %call1
ret double %add
}
@@ -70,29 +110,40 @@ entry:
define x86_fp80 @test3(x86_fp80 %x) nounwind {
entry:
; GNU_SINCOS-LABEL: test3:
+; GNU_SINCOS: callq sincosl
+; GNU_SINCOS: fldt 16(%rsp)
+; GNU_SINCOS: fldt 32(%rsp)
+; GNU_SINCOS: faddp %st(1)
+
+; GNU_SINCOS_FASTMATH-LABEL: test3:
+; GNU_SINCOS_FASTMATH: fsin
+; GNU_SINCOS_FASTMATH: fcos
+; GNU_SINCOS_FASTMATH: faddp %st(1)
+; GNU_SINCOS_FASTMATH: ret
+ %call = tail call x86_fp80 @sinl(x86_fp80 %x) readnone
+ %call1 = tail call x86_fp80 @cosl(x86_fp80 %x) readnone
+ %add = fadd x86_fp80 %call, %call1
+ ret x86_fp80 %add
+}
+
+define x86_fp80 @test3_errno(x86_fp80 %x) nounwind {
+entry:
+; GNU_SINCOS-LABEL: test3_errno:
; GNU_SINCOS: callq sinl
; GNU_SINCOS: callq cosl
-; GNU_SINCOS: ret
-; GNUX32_SINCOS-LABEL: test3:
-; GNUX32_SINCOS: callq sinl
-; GNUX32_SINCOS: callq cosl
-; GNUX32_SINCOS: ret
-
-; GNU_NOOPT: test3:
-; GNU_NOOPT: callq sinl
-; GNU_NOOPT: callq cosl
-
- %call = tail call x86_fp80 @sinl(x86_fp80 %x) nounwind
- %call1 = tail call x86_fp80 @cosl(x86_fp80 %x) nounwind
+; GNU_SINCOS_FASTMATH-LABEL: test3_errno:
+; GNU_SINCOS_FASTMATH: callq sinl
+; GNU_SINCOS_FASTMATH: callq cosl
+ %call = tail call x86_fp80 @sinl(x86_fp80 %x)
+ %call1 = tail call x86_fp80 @cosl(x86_fp80 %x)
%add = fadd x86_fp80 %call, %call1
ret x86_fp80 %add
}
-declare float @sinf(float) readonly
-declare double @sin(double) readonly
-declare float @cosf(float) readonly
-declare double @cos(double) readonly
-
+declare float @sinf(float)
+declare double @sin(double)
+declare float @cosf(float)
+declare double @cos(double)
declare x86_fp80 @sinl(x86_fp80)
declare x86_fp80 @cosl(x86_fp80)
diff --git a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
index 77497d38c897d..2ecba887f7cbf 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index 26af37e30295b..9bda90a230239 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse4.1 | FileCheck %s
; This test works just like the non-upgrade one except that it only checks
@@ -230,7 +230,6 @@ define <16 x i8> @max_epi8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pmaxsb %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %res
}
@@ -241,7 +240,6 @@ define <16 x i8> @min_epi8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pminsb %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %res
}
@@ -252,7 +250,6 @@ define <8 x i16> @max_epu16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pmaxuw %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %res
}
@@ -263,7 +260,6 @@ define <8 x i16> @min_epu16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pminuw %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %res
}
@@ -274,7 +270,6 @@ define <4 x i32> @max_epi32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pmaxsd %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %res
}
@@ -285,7 +280,6 @@ define <4 x i32> @min_epi32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pminsd %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %res
}
@@ -296,7 +290,6 @@ define <4 x i32> @max_epu32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pmaxud %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %res
}
@@ -307,7 +300,6 @@ define <4 x i32> @min_epu32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: pminud %xmm1, %xmm0
; CHECK-NEXT: retl
-;
%res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %res
}
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 38e19efb71326..362e656b4f220 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi < %s | FileCheck %s
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
@@ -450,6 +450,24 @@ define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %ma
ret <64 x i8> %4
}
+define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_vpconflictd
+ ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a0, <16 x i32> undef, i16 -1)
+ ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_vpconflictq
+ ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a0, <8 x i64> undef, i8 -1)
+ ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
+
define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) {
;CHECK-LABEL: stack_fold_pcmpeqb
;CHECK: vpcmpeqb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
@@ -486,6 +504,61 @@ define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) {
ret i32 %3
}
+define i16 @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_pcmpeqd_mask
+ ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ ; load and add are here to keep the operations below the side-effecting block and to avoid folding the wrong load
+ %2 = load <16 x i32>, <16 x i32>* %a2
+ %3 = add <16 x i32> %a1, %2
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = icmp eq <16 x i32> %3, %a0
+ %6 = and <16 x i1> %4, %5
+ %7 = bitcast <16 x i1> %6 to i16
+ ret i16 %7
+}
+
+define i16 @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted
+ ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ ; load and add are here to keep the operations below the side-effecting block and to avoid folding the wrong load
+ %2 = load <16 x i32>, <16 x i32>* %a2
+ %3 = add <16 x i32> %a1, %2
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = icmp eq <16 x i32> %a0, %3
+ %6 = and <16 x i1> %4, %5
+ %7 = bitcast <16 x i1> %6 to i16
+ ret i16 %7
+}
+
+define i16 @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_pcmpled_mask
+ ;CHECK: vpcmpled {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ ; load and add are here to keep the operations below the side-effecting block and to avoid folding the wrong load
+ %2 = load <16 x i32>, <16 x i32>* %a2
+ %3 = add <16 x i32> %a1, %2
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = icmp sge <16 x i32> %a0, %3
+ %6 = and <16 x i1> %4, %5
+ %7 = bitcast <16 x i1> %6 to i16
+ ret i16 %7
+}
+
+define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_pcmpleud
+ ;CHECK: vpcmpleud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <16 x i32>, <16 x i32>* %a2
+ %3 = add <16 x i32> %a1, %2
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = icmp uge <16 x i32> %a0, %3
+ %6 = and <16 x i1> %5, %4
+ %7 = bitcast <16 x i1> %6 to i16
+ ret i16 %7
+}
+
define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
;CHECK-LABEL: stack_fold_permbvar
;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -740,6 +813,24 @@ define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
ret <8 x i16> %2
}
+define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_vplzcntd
+ ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0)
+ ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>) nounwind readonly
+
+define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_vplzcntq
+ ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0)
+ ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>) nounwind readnone
+
define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
;CHECK-LABEL: stack_fold_pmaddubsw_zmm
;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 7ce798f778a3a..26e97ea4e599a 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512vbmi < %s | FileCheck %s
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
@@ -81,6 +81,42 @@ define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_vpconflictd
+ ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32> %a0, <4 x i32> undef, i8 -1)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_vpconflictd_ymm
+ ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32> %a0, <8 x i32> undef, i8 -1)
+ ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_vpconflictq
+ ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64> %a0, <2 x i64> undef, i8 -1)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_vpconflictq_ymm
+ ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64> %a0, <4 x i64> undef, i8 -1)
+ ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+
define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_extracti32x4
;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
@@ -708,6 +744,42 @@ define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
}
declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) nounwind readonly
+define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_vplzcntd
+ ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>) nounwind readonly
+
+define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_vplzcntd_ymm
+ ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0)
+ ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>) nounwind readonly
+
+define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_vplzcntq
+ ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_vplzcntq_ymm
+ ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0)
+ ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>) nounwind readnone
+
define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pmaddubsw
;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
diff --git a/test/CodeGen/X86/statepoint-live-in.ll b/test/CodeGen/X86/statepoint-live-in.ll
index aaa4d7c8422a9..0179d37ad4e16 100644
--- a/test/CodeGen/X86/statepoint-live-in.ll
+++ b/test/CodeGen/X86/statepoint-live-in.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -O3 < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
@@ -16,7 +16,6 @@ define void @test1(i32 %a) gc "statepoint-example" {
; CHECK-NEXT: Ltmp0:
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
-;
entry:
; We expect the argument to be passed in an extra register to bar
%statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
@@ -49,7 +48,6 @@ define void @test2(i32 %a, i32 %b) gc "statepoint-example" {
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
-;
entry:
; Because the first call clobbers esi, we have to move the values into
; new registers. Note that they stay in the registers for both calls.
@@ -68,7 +66,6 @@ define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: Ltmp3:
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
-;
entry:
; We directly reference the argument slot
%statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
@@ -89,7 +86,6 @@ define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
; CHECK-NEXT: Ltmp4:
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
-;
entry:
%statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
ret void
@@ -111,7 +107,6 @@ define i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-ex
; CHECK-NEXT: movq (%rsp), %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
-;
entry:
%token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p, i32 addrspace(1)* %p)
%p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token, i32 9, i32 9)
@@ -139,7 +134,6 @@ define void @test6(i32 %a) gc "statepoint-example" {
; CHECK-NEXT: addq $16, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
-;
entry:
call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
diff --git a/test/CodeGen/X86/swifterror.ll b/test/CodeGen/X86/swifterror.ll
index 5704d1919988f..1ecd33743d217 100644
--- a/test/CodeGen/X86/swifterror.ll
+++ b/test/CodeGen/X86/swifterror.ll
@@ -712,3 +712,111 @@ trueBB:
falseBB:
ret void
}
+
+
+declare swiftcc void @foo2(%swift_error** swifterror)
+
+; Make sure we properly assign registers during fast-isel.
+; CHECK-O0-LABEL: testAssign
+; CHECK-O0: pushq %r12
+; CHECK-O0: xorl [[ZERO:%[a-z0-9]+]], [[ZERO]]
+; CHECK-O0: movl [[ZERO]], %r12d
+; CHECK-O0: callq _foo2
+; CHECK-O0: movq %r12, [[SLOT:[-a-z0-9\(\)\%]*]]
+;
+; CHECK-O0: movq [[SLOT]], %rax
+; CHECK-O0: popq %r12
+; CHECK-O0: retq
+
+; CHECK-APPLE-LABEL: testAssign
+; CHECK-APPLE: pushq %r12
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq _foo2
+; CHECK-APPLE: movq %r12, %rax
+; CHECK-APPLE: popq %r12
+; CHECK-APPLE: retq
+
+define swiftcc %swift_error* @testAssign(i8* %error_ref) {
+entry:
+ %error_ptr = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr
+ call swiftcc void @foo2(%swift_error** swifterror %error_ptr)
+ br label %a
+
+a:
+ %error = load %swift_error*, %swift_error** %error_ptr
+ ret %swift_error* %error
+}
+
+; CHECK-O0-LABEL: testAssign2
+; CHECK-O0: movq %r12, {{.*}}
+; CHECK-O0: movq %r12, [[SLOT:[-a-z0-9\(\)\%]*]]
+; CHECK-O0: jmp
+; CHECK-O0: movq [[SLOT]], %rax
+; CHECK-O0: movq %rax, [[SLOT2:[-a-z0-9\(\)\%]*]]
+; CHECK-O0: movq [[SLOT2]], %r12
+; CHECK-O0: retq
+
+; CHECK-APPLE-LABEL: testAssign2
+; CHECK-APPLE: movq %r12, %rax
+; CHECK-APPLE: retq
+define swiftcc %swift_error* @testAssign2(i8* %error_ref, %swift_error** swifterror %err) {
+entry:
+ br label %a
+
+a:
+ %error = load %swift_error*, %swift_error** %err
+ ret %swift_error* %error
+}
+
+; CHECK-O0-LABEL: testAssign3
+; CHECK-O0: callq _foo2
+; CHECK-O0: movq %r12, [[SLOT:[-a-z0-9\(\)\%]*]]
+; CHECK-O0: movq [[SLOT]], %rax
+; CHECK-O0: movq %rax, [[SLOT2:[-a-z0-9\(\)\%]*]]
+; CHECK-O0: movq [[SLOT2]], %r12
+; CHECK-O0: addq $24, %rsp
+; CHECK-O0: retq
+
+; CHECK-APPLE-LABEL: testAssign3
+; CHECK-APPLE: callq _foo2
+; CHECK-APPLE: movq %r12, %rax
+; CHECK-APPLE: retq
+
+define swiftcc %swift_error* @testAssign3(i8* %error_ref, %swift_error** swifterror %err) {
+entry:
+ call swiftcc void @foo2(%swift_error** swifterror %err)
+ br label %a
+
+a:
+ %error = load %swift_error*, %swift_error** %err
+ ret %swift_error* %error
+}
+
+
+; CHECK-O0-LABEL: testAssign4
+; CHECK-O0: callq _foo2
+; CHECK-O0: xorl %ecx, %ecx
+; CHECK-O0: movl %ecx, %eax
+; CHECK-O0: movq %rax, [[SLOT:[-a-z0-9\(\)\%]*]]
+; CHECK-O0: movq [[SLOT]], %rax
+; CHECK-O0: movq %rax, [[SLOT2:[-a-z0-9\(\)\%]*]]
+; CHECK-O0: movq [[SLOT2]], %r12
+; CHECK-O0: retq
+
+; CHECK-APPLE-LABEL: testAssign4
+; CHECK-APPLE: callq _foo2
+; CHECK-APPLE: xorl %eax, %eax
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: retq
+
+define swiftcc %swift_error* @testAssign4(i8* %error_ref, %swift_error** swifterror %err) {
+entry:
+ call swiftcc void @foo2(%swift_error** swifterror %err)
+ store %swift_error* null, %swift_error** %err
+ br label %a
+
+a:
+ %error = load %swift_error*, %swift_error** %err
+ ret %swift_error* %error
+}
diff --git a/test/CodeGen/X86/urem-i8-constant.ll b/test/CodeGen/X86/urem-i8-constant.ll
index 45717f985c233..2a659b20de8fb 100644
--- a/test/CodeGen/X86/urem-i8-constant.ll
+++ b/test/CodeGen/X86/urem-i8-constant.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
define i8 @foo(i8 %tmp325) {
@@ -14,7 +14,6 @@ define i8 @foo(i8 %tmp325) {
; CHECK-NEXT: subb %al, %cl
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
-;
%t546 = urem i8 %tmp325, 37
ret i8 %t546
}
diff --git a/test/CodeGen/X86/urem-power-of-two.ll b/test/CodeGen/X86/urem-power-of-two.ll
index 469c573443ea6..1b56c87aad5f8 100644
--- a/test/CodeGen/X86/urem-power-of-two.ll
+++ b/test/CodeGen/X86/urem-power-of-two.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; The easy case: a constant power-of-2 divisor.
@@ -9,7 +9,6 @@ define i64 @const_pow_2(i64 %x) {
; CHECK-NEXT: andl $31, %edi
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
-;
%urem = urem i64 %x, 32
ret i64 %urem
}
@@ -25,7 +24,6 @@ define i25 @shift_left_pow_2(i25 %x, i25 %y) {
; CHECK-NEXT: addl $33554431, %eax # imm = 0x1FFFFFF
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: retq
-;
%shl = shl i25 1, %y
%urem = urem i25 %x, %shl
ret i25 %urem
@@ -43,7 +41,6 @@ define i16 @shift_right_pow_2(i16 %x, i16 %y) {
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
-;
%shr = lshr i16 -32768, %y
%urem = urem i16 %x, %shr
ret i16 %urem
@@ -61,7 +58,6 @@ define i8 @and_pow_2(i8 %x, i8 %y) {
; CHECK-NEXT: movzbl %ah, %eax # NOREX
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
-;
%and = and i8 %y, 4
%urem = urem i8 %x, %and
ret i8 %urem
@@ -74,7 +70,6 @@ define <4 x i32> @vec_const_pow_2(<4 x i32> %x) {
; CHECK: # BB#0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
%urem = urem <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
ret <4 x i32> %urem
}
diff --git a/test/CodeGen/X86/vec3.ll b/test/CodeGen/X86/vec3.ll
index 8eaf9f4f48e43..e9c47ffd21c6d 100644
--- a/test/CodeGen/X86/vec3.ll
+++ b/test/CodeGen/X86/vec3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse | FileCheck %s
define <3 x float> @fadd(<3 x float> %v, float %d) {
@@ -7,7 +7,6 @@ define <3 x float> @fadd(<3 x float> %v, float %d) {
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,3]
; CHECK-NEXT: addps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%ins = insertelement <3 x float> undef, float %d, i32 0
%splat = shufflevector <3 x float> %ins, <3 x float> undef, <3 x i32> zeroinitializer
%add = fadd <3 x float> %splat, %v
@@ -23,7 +22,6 @@ define <3 x float> @fdiv(<3 x float> %v, float %d) {
; CHECK-NEXT: divps %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
-;
%ins = insertelement <3 x float> undef, float %d, i32 0
%splat = shufflevector <3 x float> %ins, <3 x float> undef, <3 x i32> zeroinitializer
%div = fdiv <3 x float> %splat, %v
diff --git a/test/CodeGen/X86/vector-compare-combines.ll b/test/CodeGen/X86/vector-compare-combines.ll
index c25474d92f9cd..bd7cbfb4bac0e 100644
--- a/test/CodeGen/X86/vector-compare-combines.ll
+++ b/test/CodeGen/X86/vector-compare-combines.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
@@ -17,7 +17,6 @@ define <4 x i32> @PR27924_cmpeq(<4 x i32> %a, <4 x i32> %b) {
; AVX: # BB#0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%cmp = icmp sgt <4 x i32> %a, %b
%max = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
%sse_max = tail call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a, <4 x i32> %b)
@@ -36,7 +35,6 @@ define <4 x i32> @PR27924_cmpgt(<4 x i32> %a, <4 x i32> %b) {
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
-;
%cmp = icmp sgt <4 x i32> %a, %b
%max = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
%sse_max = tail call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a, <4 x i32> %b)
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index fad5586dd77cd..d34728df29b74 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1559,6 +1559,24 @@ define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_z
ret <16 x i16> %shuffle
}
+define <16 x i16> @shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11,28,29,30,31,18,19,20,21,30,31,16,17,24,25,26,27]
+; AVX2OR512VL-NEXT: retq
+ %1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 6, i32 7, i32 1, i32 2, i32 7, i32 0, i32 4, i32 5, i32 14, i32 15, i32 9, i32 10, i32 15, i32 8, i32 12, i32 13>
+ ret <16 x i16> %1
+}
+
;
; Shuffle to logical bit shifts
;
diff --git a/test/CodeGen/X86/vzero-excess.ll b/test/CodeGen/X86/vzero-excess.ll
index 0ed90741b61eb..9ddafec651824 100644
--- a/test/CodeGen/X86/vzero-excess.ll
+++ b/test/CodeGen/X86/vzero-excess.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
; In the following 4 tests, the existing call to VZU/VZA ensures clean state before
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 74214aa1b8b74..ec8bce1b43cc3 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -9,8 +9,8 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vmovupd 64(%rdi), %ymm2
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
@@ -37,8 +37,8 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vmovupd 64(%rdi), %ymm2
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
@@ -53,25 +53,15 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
}
define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
-; AVX1-LABEL: load_factorf64_1:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovups (%rdi), %ymm0
-; AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_factorf64_1:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovupd (%rdi), %ymm0
-; AVX2-NEXT: vmovupd 32(%rdi), %ymm1
-; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-NEXT: vmulpd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load_factorf64_1:
+; AVX: # BB#0:
+; AVX-NEXT: vmovupd (%rdi), %ymm0
+; AVX-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vmulpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -86,8 +76,8 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
@@ -113,8 +103,8 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
; AVX2-NEXT: vmovdqu 96(%rdi), %ymm3
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]